https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/115581
Resolve compile fail without SSE2. >From 54b3ba2bc49e04293e8543f37dae9b8f362c04e4 Mon Sep 17 00:00:00 2001 From: "Wang, Phoebe" <phoebe.w...@intel.com> Date: Sat, 9 Nov 2024 12:48:19 +0800 Subject: [PATCH 1/2] Reland "[X86][AMX] Support AMX-AVX512" Resolve compile fail without SSE2. --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/BuiltinsX86_64.def | 13 + clang/include/clang/Driver/Options.td | 2 + clang/lib/Basic/Targets/X86.cpp | 6 + clang/lib/Basic/Targets/X86.h | 1 + clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/amxavx512intrin.h | 382 ++++++++++++++++++ clang/lib/Headers/immintrin.h | 4 + clang/lib/Sema/SemaX86.cpp | 6 + clang/test/CodeGen/X86/amx_avx512_api.c | 52 +++ clang/test/CodeGen/X86/amxavx512-builtins.c | 41 ++ clang/test/CodeGen/attr-target-x86.c | 8 +- clang/test/Driver/x86-target-features.c | 7 + clang/test/Preprocessor/x86_target_features.c | 12 + llvm/include/llvm/IR/IntrinsicsX86.td | 51 +++ .../llvm/TargetParser/X86TargetParser.def | 1 + llvm/lib/Target/X86/X86.td | 4 + llvm/lib/Target/X86/X86ExpandPseudo.cpp | 67 ++- llvm/lib/Target/X86/X86ISelLowering.cpp | 76 ++++ llvm/lib/Target/X86/X86InstrAMX.td | 147 +++++++ llvm/lib/Target/X86/X86InstrPredicates.td | 1 + llvm/lib/Target/X86/X86LowerAMXType.cpp | 11 + llvm/lib/Target/X86/X86PreTileConfig.cpp | 18 +- llvm/lib/TargetParser/Host.cpp | 1 + llvm/lib/TargetParser/X86TargetParser.cpp | 2 + .../CodeGen/X86/amx-across-func-tilemovrow.ll | 171 ++++++++ .../test/CodeGen/X86/amx-avx512-intrinsics.ll | 116 ++++++ .../CodeGen/X86/amx-tile-avx512-internals.ll | 61 +++ llvm/test/MC/Disassembler/X86/amx-avx512.txt | 106 +++++ llvm/test/MC/X86/amx-avx512-att.s | 105 +++++ llvm/test/MC/X86/amx-avx512-intel.s | 105 +++++ 31 files changed, 1567 insertions(+), 12 deletions(-) create mode 100644 clang/lib/Headers/amxavx512intrin.h create mode 100644 clang/test/CodeGen/X86/amx_avx512_api.c create mode 100644 clang/test/CodeGen/X86/amxavx512-builtins.c create mode 100644 
llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll create mode 100644 llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll create mode 100644 llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll create mode 100644 llvm/test/MC/Disassembler/X86/amx-avx512.txt create mode 100644 llvm/test/MC/X86/amx-avx512-att.s create mode 100644 llvm/test/MC/X86/amx-avx512-intel.s diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index f82fbb73b12162..c3424e0e6f34c9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -739,6 +739,7 @@ X86 Support * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``. - Support ISA of ``AMX-FP8``. - Support ISA of ``AMX-TRANSPOSE``. +- Support ISA of ``AMX-AVX512``. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index d95e8455a304b6..9f7462b1e0d962 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512") 
+TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512") // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") @@ -159,6 +165,13 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l, "V32yIUcUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512,avx10.2-512") + TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8887e0c1495d2a..0dba5672c5a85d 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6285,6 +6285,8 @@ def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>; def mno_fp_ret_in_387 : Flag<["-"], "mno-fp-ret-in-387">, Alias<mno_x87>; def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>; def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>; +def mamx_avx512 : Flag<["-"], "mamx-avx512">, Group<m_x86_Features_Group>; +def mno_amx_avx512 : Flag<["-"], "mno-amx-avx512">, Group<m_x86_Features_Group>; def mamx_bf16 : Flag<["-"], "mamx-bf16">, 
Group<m_x86_Features_Group>; def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>; def mamx_complex : Flag<["-"], "mamx-complex">, Group<m_x86_Features_Group>; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index d7d3adef42c79a..3c3dbfa13e452b 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -432,6 +432,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAMXFP8 = true; } else if (Feature == "+amx-transpose") { HasAMXTRANSPOSE = true; + } else if (Feature == "+amx-avx512") { + HasAMXAVX512 = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -955,6 +957,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP8__"); if (HasAMXTRANSPOSE) Builder.defineMacro("__AMX_TRANSPOSE__"); + if (HasAMXAVX512) + Builder.defineMacro("__AMX_AVX512__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -1080,6 +1084,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { return llvm::StringSwitch<bool>(Name) .Case("adx", true) .Case("aes", true) + .Case("amx-avx512", true) .Case("amx-bf16", true) .Case("amx-complex", true) .Case("amx-fp16", true) @@ -1200,6 +1205,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch<bool>(Feature) .Case("adx", HasADX) .Case("aes", HasAES) + .Case("amx-avx512", HasAMXAVX512) .Case("amx-bf16", HasAMXBF16) .Case("amx-complex", HasAMXCOMPLEX) .Case("amx-fp16", HasAMXFP16) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index e2eba63b992355..70047731b17295 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -159,6 +159,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; bool HasAMXTRANSPOSE = false; + bool HasAMXAVX512 = false; bool HasSERIALIZE 
= false; bool HasTSXLDTRK = false; bool HasUSERMSR = false; diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 67242cd4d981bc..76366ca1f108e9 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -146,6 +146,7 @@ set(x86_files adcintrin.h adxintrin.h ammintrin.h + amxavx512intrin.h amxcomplexintrin.h amxfp16intrin.h amxfp8intrin.h diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h new file mode 100644 index 00000000000000..945edea543e706 --- /dev/null +++ b/clang/lib/Headers/amxavx512intrin.h @@ -0,0 +1,382 @@ +/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_AVX512INTRIN_H +#define __AMX_AVX512INTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_AVX512 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-avx512,avx10.2-512"))) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the int32 source elements to fp32. The row of the tile is selected by a +/// 32b GPR. 
+/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512 _tile_cvtrowd2ps(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWD2PS instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The row of the source tile. +#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to bf16. It places the resulting bf16 elements +/// in the high 16 bits within each dword. The row of the tile is selected +/// by a 32b GPR. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512bh _tile_cvtrowps2pbf16h(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+0] := 0 +/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2PBF16H instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The row of the source tile. 
+#define _tile_cvtrowps2pbf16h(tsrc, row) \ + __builtin_ia32_tcvtrowps2pbf16h(tsrc, row) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to bf16. It places the resulting bf16 elements +/// in the low 16 bits within each dword. The row of the tile is selected +/// by a 32b GPR. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512bh _tile_cvtrowps2pbf16l(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+1] := 0 +/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2PBF16L instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The row of the source tile. +#define _tile_cvtrowps2pbf16l(tsrc, row) \ + __builtin_ia32_tcvtrowps2pbf16l(tsrc, row) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to fp16. It places the resulting fp16 elements +/// in the high 16 bits within each dword. The row of the tile is selected +/// by a 32b GPR. 
+/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512h _tile_cvtrowps2phh(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+0] := 0 +/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The row of the source tile. +#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to fp16. It places the resulting fp16 elements +/// in the low 16 bits within each dword. The row of the tile is selected +/// by a 32b GPR. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m512h _tile_cvtrowps2phl(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+1] := 0 +/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The row of the source tile. 
+#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row) + +/// Move one row of a tile data to a v16i32 data. +/// The row of the tile is selected by a 32b GPR. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// __m512i _tile_movrow(__tile a, unsigned b); +/// \endcode +/// +/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction. +/// +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v16i32 data. Size is 64 Bytes. +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL>>3 +/// row_index := b&0xffff +/// row_chunk := ((b>>16)&0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes-1) +/// IF (row_chunk + i >= a.colsb) +/// dst.byte[i] := 0 +/// ELSE +/// dst.byte[i] := a.row[row_index].byte[row_chunk+i] +/// ENDFOR +/// \endcode +#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b) + +/// This is an internal intrinsic. C/C++ users should avoid calling it directly. 
+ +static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal( + unsigned short m, unsigned short n, _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 +_tile_cvtrowps2pbf16h_internal(unsigned short m, unsigned short n, + _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2pbf16h_internal(m, n, src, u); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 +_tile_cvtrowps2pbf16l_internal(unsigned short m, unsigned short n, + _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2pbf16l_internal(m, n, src, u); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal( + unsigned short m, unsigned short n, _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal( + unsigned short m, unsigned short n, _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal( + unsigned short m, unsigned short n, _tile1024i src, unsigned u) { + return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u); +} + +/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source +/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if +/// MXCSR.RC=RNE. Embedded rounding is not supported. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v16f32 data. Size is 64 Bytes. 
+__DEFAULT_FN_ATTRS_AVX512 +static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) { + return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source +/// elements to bf16 at high 16-bits of each dword. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16H </c> instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v32bf16 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2pbf16h_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source +/// elements to bf16 at low 16-bits of each dword. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16L </c> instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v32bf16 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512bh __tile_cvtrowps2pbf16l(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2pbf16l_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source +/// elements to fp16 at high 16-bits of each dword. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. 
+/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v32fp16 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source +/// elements to fp16 at low 16-bits of each dword. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v32fp16 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move one row of a tile data to a v16f32 data. +/// The row of the tile is selected by a 32b GPR. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v16i32 data. Size is 64 Bytes. 
+__DEFAULT_FN_ATTRS_AVX512 +static __m512i __tile_movrow(__tile1024i src0, unsigned src1) { + return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1); +} + +#endif // __x86_64__ +#endif // __AMX_AVX512INTRIN_H diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 4bf7eac4195eec..bc240e28d59142 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -656,6 +656,10 @@ _storebe_i64(void * __P, long long __D) { #include <amxtransposeintrin.h> #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__) +#include <amxavx512intrin.h> +#endif + #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include <avx512vp2intersectintrin.h> diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index ef878d16d445fd..1155a5edc73c34 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -635,6 +635,12 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_t2rpntlvwz0t1: case X86::BI__builtin_ia32_t2rpntlvwz1: case X86::BI__builtin_ia32_t2rpntlvwz1t1: + case X86::BI__builtin_ia32_tcvtrowps2pbf16h: + case X86::BI__builtin_ia32_tcvtrowps2pbf16l: + case X86::BI__builtin_ia32_tcvtrowps2phh: + case X86::BI__builtin_ia32_tcvtrowps2phl: + case X86::BI__builtin_ia32_tcvtrowd2ps: + case X86::BI__builtin_ia32_tilemovrow: return CheckBuiltinTileArgumentsRange(TheCall, 0); case X86::BI__builtin_ia32_tdpbssd: case X86::BI__builtin_ia32_tdpbsud: diff --git a/clang/test/CodeGen/X86/amx_avx512_api.c b/clang/test/CodeGen/X86/amx_avx512_api.c new file mode 100644 index 00000000000000..aea790d61268d3 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_avx512_api.c @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-avx512 -target-feature +avx10.2-512 \ +// RUN: -emit-llvm -o - -Werror -pedantic | 
FileCheck %s --check-prefixes=CHECK + +#include <immintrin.h> + +char buf[1024]; +#define STRIDE 32 + +char buf2[1024]; + +__m512 test_tile_cvtrowd2ps(__tile1024i a, unsigned b) { + //CHECK-LABEL: @test_tile_cvtrowd2ps + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call <16 x float> @llvm.x86.tcvtrowd2ps.internal + return __tile_cvtrowd2ps(a, b); +} + +__m512bh test_tile_cvtrowps2pbf16h(__tile1024i a, unsigned b) { + //CHECK-LABEL: @test_tile_cvtrowps2pbf16h + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal + return __tile_cvtrowps2pbf16h(a, b); +} + +__m512bh test_tile_cvtrowps2pbf16l(__tile1024i a, unsigned b) { + //CHECK-LABEL: @test_tile_cvtrowps2pbf16l + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal + return __tile_cvtrowps2pbf16l(a, b); +} + +__m512h test_tile_cvtrowps2phh(__tile1024i a, unsigned b) { + //CHECK-LABEL: @test_tile_cvtrowps2phh + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call <32 x half> @llvm.x86.tcvtrowps2phh.internal + return __tile_cvtrowps2phh(a, b); +} + +__m512h test_tile_cvtrowps2phl(__tile1024i a, unsigned b) { + //CHECK-LABEL: @test_tile_cvtrowps2phl + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call <32 x half> @llvm.x86.tcvtrowps2phl.internal + return __tile_cvtrowps2phl(a, b); +} + +__m512i test_tile_movrow(__tile1024i a, unsigned b) { + //CHECK-LABEL: @test_tile_movrow + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call <16 x i32> @llvm.x86.tilemovrow.internal + return __tile_movrow(a, b); +} diff --git a/clang/test/CodeGen/X86/amxavx512-builtins.c b/clang/test/CodeGen/X86/amxavx512-builtins.c new 
file mode 100644 index 00000000000000..172b5ae8f53081 --- /dev/null +++ b/clang/test/CodeGen/X86/amxavx512-builtins.c @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-avx512 \ +// RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +__m512 test_tile_cvtrowd2ps(unsigned int A) { + // CHECK-LABEL: @test_tile_cvtrowd2ps( + // CHECK: call <16 x float> @llvm.x86.tcvtrowd2ps(i8 1, i32 %{{.*}}) + return _tile_cvtrowd2ps(1, A); +} + +__m512bh test_tile_cvtrowps2pbf16h(unsigned int A) { + // CHECK-LABEL: @test_tile_cvtrowps2pbf16h( + // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 1, i32 %{{.*}}) + return _tile_cvtrowps2pbf16h(1, A); +} + +__m512bh test_tile_cvtrowps2pbf16l(unsigned int A) { + // CHECK-LABEL: @test_tile_cvtrowps2pbf16l( + // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 1, i32 %{{.*}}) + return _tile_cvtrowps2pbf16l(1, A); +} + +__m512h test_tile_cvtrowps2phh(unsigned int A) { + // CHECK-LABEL: @test_tile_cvtrowps2phh( + // CHECK: call <32 x half> @llvm.x86.tcvtrowps2phh(i8 1, i32 %{{.*}}) + return _tile_cvtrowps2phh(1, A); +} + +__m512h test_tile_cvtrowps2phl(unsigned int A) { + // CHECK-LABEL: @test_tile_cvtrowps2phl( + // CHECK: call <32 x half> @llvm.x86.tcvtrowps2phl(i8 1, i32 %{{.*}}) + return _tile_cvtrowps2phl(1, A); +} + +__m512i test_tile_movrow(unsigned int A) { + // CHECK-LABEL: @test_tile_movrow + // CHECK: %1 = call <16 x i32> @llvm.x86.tilemovrow(i8 1, i32 %{{.*}}) + return _tile_movrow(1, A); +} diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c index 593ccffbcda095..2033a8b4c335f9 100644 --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -59,10 +59,10 @@ void __attribute__((target("avx10.1-512"))) avx10_1_512(void) {} // CHECK: #0 = 
{{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-amx-avx512,-avx,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" -// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" 
"tune-cpu"="i686" -// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-vaes" +// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-amx-avx512,-avx,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-amx-avx512,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-vaes" // CHECK-NOT: tune-cpu // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-mmx" // CHECK: #7 = {{.*}}"target-cpu"="lakemont" "target-features"="+cx8,+mmx" @@ -76,5 +76,5 @@ void __attribute__((target("avx10.1-512"))) avx10_1_512(void) {} // CHECK: "target-cpu"="x86-64-v4" // CHECK-SAME: "target-features"="+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" -// CHECK: #12 = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave,-avx10.1-512,-avx10.2-512,-evex512" 
+// CHECK: #12 = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave,-amx-avx512,-avx10.1-512,-avx10.2-512,-evex512" // CHECK: #13 = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx10.1-512,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave" diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index e8c439ab48f21f..822c997f71744f 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -311,6 +311,13 @@ // AMX-TRANSPOSE: "-target-feature" "+amx-transpose" // NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose" +// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-avx512 %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-AVX512 %s +// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-avx512 %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-AVX512 %s +// AMX-AVX512: "-target-feature" "+amx-avx512" +// NO-AMX-AVX512: "-target-feature" "-amx-avx512" + // RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s // RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s // HRESET: "-target-feature" "+hreset" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index c240b27c91a479..8e4ddb1526626e 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ 
b/clang/test/Preprocessor/x86_target_features.c @@ -558,6 +558,18 @@ // NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1 +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-avx512 -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-AVX512 %s + +// AMX-AVX512: #define __AMX_AVX512__ 1 + +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-avx512 -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-AVX512 %s +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-avx512 -mno-amx-tile \ +// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-AVX512 %s + +// NO-AMX-AVX512-NOT: #define __AMX_AVX512__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s // AVXVNNI: #define __AVX2__ 1 diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index c42397024e45a7..3003f9887e239c 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5952,6 +5952,26 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + // AMX-AVX512 + def int_x86_tcvtrowd2ps : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps">, + Intrinsic<[llvm_v16f32_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_tcvtrowps2pbf16h : ClangBuiltin<"__builtin_ia32_tcvtrowps2pbf16h">, + Intrinsic<[llvm_v32bf16_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_tcvtrowps2pbf16l : ClangBuiltin<"__builtin_ia32_tcvtrowps2pbf16l">, + Intrinsic<[llvm_v32bf16_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_tcvtrowps2phh : ClangBuiltin<"__builtin_ia32_tcvtrowps2phh">, + Intrinsic<[llvm_v32f16_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_tcvtrowps2phl : ClangBuiltin<"__builtin_ia32_tcvtrowps2phl">, + Intrinsic<[llvm_v32f16_ty], [llvm_i8_ty, 
llvm_i32_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_tilemovrow : ClangBuiltin<"__builtin_ia32_tilemovrow">, + Intrinsic<[llvm_v16i32_ty], [llvm_i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<0>>]>; + // AMX - internal intrinsics def int_x86_ldtilecfg_internal : ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">, @@ -6050,6 +6070,37 @@ let TargetPrefix = "x86" in { ClangBuiltin<"__builtin_ia32_ttransposed_internal">, Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; + + def int_x86_tcvtrowd2ps_internal : + ClangBuiltin<"__builtin_ia32_tcvtrowd2ps_internal">, + Intrinsic<[llvm_v16f32_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], + []>; + def int_x86_tcvtrowps2pbf16h_internal : + ClangBuiltin<"__builtin_ia32_tcvtrowps2pbf16h_internal">, + Intrinsic<[llvm_v32bf16_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], + []>; + def int_x86_tcvtrowps2pbf16l_internal : + ClangBuiltin<"__builtin_ia32_tcvtrowps2pbf16l_internal">, + Intrinsic<[llvm_v32bf16_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], + []>; + def int_x86_tcvtrowps2phh_internal : + ClangBuiltin<"__builtin_ia32_tcvtrowps2phh_internal">, + Intrinsic<[llvm_v32f16_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], + []>; + def int_x86_tcvtrowps2phl_internal : + ClangBuiltin<"__builtin_ia32_tcvtrowps2phl_internal">, + Intrinsic<[llvm_v32f16_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], + []>; + def int_x86_tilemovrow_internal : + ClangBuiltin<"__builtin_ia32_tilemovrow_internal">, + Intrinsic<[llvm_v16i32_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], + []>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index a62b4df420ec6a..815556e374bef5 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ 
b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -266,6 +266,7 @@ X86_FEATURE (MOVRS, "movrs") X86_FEATURE (ZU, "zu") X86_FEATURE (AMX_FP8, "amx-fp8") X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") +X86_FEATURE (AMX_AVX512, "amx-avx512") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 160e7c0fc0310a..59780ba5b99fcf 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -276,6 +276,10 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", "Support AMX amx-transpose instructions", [FeatureAMXTILE]>; +def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512", + "HasAMXAVX512", "true", + "Support AMX-AVX512 instructions", + [FeatureAMXTILE]>; def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true", "Support CMPCCXADD instructions">; def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true", diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index f832955d1202fa..9511a82f0e97d2 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -559,12 +559,68 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, return true; } case X86::PTILELOADDV: - case X86::PTILELOADDT1V: { + case X86::PTILELOADDT1V: + case X86::PTCVTROWD2PSrreV: + case X86::PTCVTROWD2PSrriV: + case X86::PTCVTROWPS2PBF16HrreV: + case X86::PTCVTROWPS2PBF16HrriV: + case X86::PTCVTROWPS2PBF16LrreV: + case X86::PTCVTROWPS2PBF16LrriV: + case X86::PTCVTROWPS2PHHrreV: + case X86::PTCVTROWPS2PHHrriV: + case X86::PTCVTROWPS2PHLrreV: + case X86::PTCVTROWPS2PHLrriV: + case X86::PTILEMOVROWrreV: + case X86::PTILEMOVROWrriV: { for (unsigned 
i = 2; i > 0; --i) MI.removeOperand(i); - unsigned Opc = Opcode == X86::PTILELOADDV - ? GET_EGPR_IF_ENABLED(X86::TILELOADD) - : GET_EGPR_IF_ENABLED(X86::TILELOADDT1); + unsigned Opc; + switch (Opcode) { + case X86::PTILELOADDV: + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); + break; + case X86::PTILELOADDT1V: + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1); + break; + case X86::PTCVTROWD2PSrreV: + Opc = X86::TCVTROWD2PSrre; + break; + case X86::PTCVTROWD2PSrriV: + Opc = X86::TCVTROWD2PSrri; + break; + case X86::PTCVTROWPS2PBF16HrreV: + Opc = X86::TCVTROWPS2PBF16Hrre; + break; + case X86::PTCVTROWPS2PBF16HrriV: + Opc = X86::TCVTROWPS2PBF16Hrri; + break; + case X86::PTCVTROWPS2PBF16LrreV: + Opc = X86::TCVTROWPS2PBF16Lrre; + break; + case X86::PTCVTROWPS2PBF16LrriV: + Opc = X86::TCVTROWPS2PBF16Lrri; + break; + case X86::PTCVTROWPS2PHHrreV: + Opc = X86::TCVTROWPS2PHHrre; + break; + case X86::PTCVTROWPS2PHHrriV: + Opc = X86::TCVTROWPS2PHHrri; + break; + case X86::PTCVTROWPS2PHLrreV: + Opc = X86::TCVTROWPS2PHLrre; + break; + case X86::PTCVTROWPS2PHLrriV: + Opc = X86::TCVTROWPS2PHLrri; + break; + case X86::PTILEMOVROWrreV: + Opc = X86::TILEMOVROWrre; + break; + case X86::PTILEMOVROWrriV: + Opc = X86::TILEMOVROWrri; + break; + default: + llvm_unreachable("Unexpected Opcode"); + } MI.setDesc(TII->get(Opc)); return true; } @@ -714,7 +770,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break; - default: llvm_unreachable("Impossible Opcode!"); + default: + llvm_unreachable("Unexpected Opcode"); } MI.setDesc(TII->get(Opc)); MI.tieOperands(0, 1); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 91e48f1e77db12..19a85a6d7ec6ce 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37613,6 +37613,82 @@ 
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PTCVTROWPS2PBF16Hrri: + case X86::PTCVTROWPS2PBF16Lrri: + case X86::PTCVTROWPS2PHHrri: + case X86::PTCVTROWPS2PHLrri: + case X86::PTCVTROWD2PSrri: + case X86::PTILEMOVROWrri: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::PTCVTROWD2PSrri: + Opc = X86::TCVTROWD2PSrri; + break; + case X86::PTCVTROWPS2PBF16Hrri: + Opc = X86::TCVTROWPS2PBF16Hrri; + break; + case X86::PTCVTROWPS2PHHrri: + Opc = X86::TCVTROWPS2PHHrri; + break; + case X86::PTCVTROWPS2PBF16Lrri: + Opc = X86::TCVTROWPS2PBF16Lrri; + break; + case X86::PTCVTROWPS2PHLrri: + Opc = X86::TCVTROWPS2PHLrri; + break; + case X86::PTILEMOVROWrri: + Opc = X86::TILEMOVROWrri; + break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.add(MI.getOperand(0)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + MIB.addImm(MI.getOperand(2).getImm()); + + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; + } + case X86::PTCVTROWPS2PBF16Hrre: + case X86::PTCVTROWPS2PBF16Lrre: + case X86::PTCVTROWPS2PHHrre: + case X86::PTCVTROWPS2PHLrre: + case X86::PTCVTROWD2PSrre: + case X86::PTILEMOVROWrre: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::PTCVTROWD2PSrre: + Opc = X86::TCVTROWD2PSrre; + break; + case X86::PTCVTROWPS2PBF16Hrre: + Opc = X86::TCVTROWPS2PBF16Hrre; + break; + case X86::PTCVTROWPS2PBF16Lrre: + Opc = X86::TCVTROWPS2PBF16Lrre; + break; + case X86::PTCVTROWPS2PHHrre: + Opc = X86::TCVTROWPS2PHHrre; + break; + case X86::PTCVTROWPS2PHLrre: + Opc = X86::TCVTROWPS2PHLrre; + break; + case X86::PTILEMOVROWrre: + Opc = X86::TILEMOVROWrre; + break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.add(MI.getOperand(0)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + MIB.add(MI.getOperand(2)); + + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; + } } } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 947a8bec2890ef..b954c977f8c6c9 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -369,3 +369,150 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { } } } // HasAMXTILE, HasAMXTRANSPOSE + +multiclass m_tcvtrowd2ps { + let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { + let SchedRW = [WriteSystem] in { + def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), + (ins TILE:$src1, i32u8imm:$src2), + "tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, TA,XS, EVEX, EVEX_V512; + def rre : I<0x4A, MRMSrcReg4VOp3, (outs VR512:$dst), + (ins TILE:$src1, GR32:$src2), + "tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, T8,XS, EVEX, VVVV, EVEX_V512; + } + } // HasAMXAVX512, HasAVX10_2_512, In64BitMode +} + +defm TCVTROWD2PS : m_tcvtrowd2ps; + +let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let usesCustomInserter = 1 in { + def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), + [(set VR512:$dst, (int_x86_tcvtrowd2ps timm:$src1, imm:$src2))]>; + def PTCVTROWD2PSrre : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, GR32:$src2), + [(set VR512:$dst, (int_x86_tcvtrowd2ps timm:$src1, GR32:$src2))]>; + } + + def PTCVTROWD2PSrriV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowd2ps_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; + def PTCVTROWD2PSrreV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowd2ps_internal GR16:$src1, GR16:$src2, + TILE:$src3, GR32:$src4))]>; + def PTCVTROWPS2PBF16HrriV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2pbf16h_internal GR16:$src1, 
GR16:$src2, + TILE:$src3, imm:$src4))]>; + def PTCVTROWPS2PBF16HrreV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2pbf16h_internal GR16:$src1, GR16:$src2, + TILE:$src3, GR32:$src4))]>; + def PTCVTROWPS2PBF16LrriV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2pbf16l_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; + def PTCVTROWPS2PBF16LrreV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2pbf16l_internal GR16:$src1, GR16:$src2, + TILE:$src3, GR32:$src4))]>; + def PTCVTROWPS2PHHrriV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2phh_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; + def PTCVTROWPS2PHHrreV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2phh_internal GR16:$src1, GR16:$src2, + TILE:$src3, GR32:$src4))]>; + def PTCVTROWPS2PHLrriV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2phl_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; + def PTCVTROWPS2PHLrreV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), + [(set VR512: $dst, + (int_x86_tcvtrowps2phl_internal GR16:$src1, GR16:$src2, + TILE:$src3, GR32:$src4))]>; + } +} + +multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr, + Prefix P1, Prefix P2> { + let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode], SchedRW = [WriteSystem] in { + let OpPrefix = P1 in + def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst), + (ins TILE:$src1, GR32:$src2), + !strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, EVEX, 
VVVV, EVEX_V512, T8; + let OpPrefix = P2 in + def rri : Ii8<Opcode2, MRMSrcReg, (outs VR512:$dst), + (ins TILE:$src1, i32u8imm:$src2), + !strconcat(Opstr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, EVEX, EVEX_V512, TA; + let usesCustomInserter = 1 in { + def "P"#NAME#"rre" : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, GR32:$src2), + [(set VR512:$dst, + (!cast<Intrinsic>("int_x86_"#Opstr) timm:$src1, GR32:$src2))]>; + def "P"#NAME#"rri" : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), + [(set VR512:$dst, + (!cast<Intrinsic>("int_x86_"#Opstr) timm:$src1, imm:$src2))]>; + } + } +} + +defm TCVTROWPS2PHH : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2phh", PS, PS>; +defm TCVTROWPS2PHL : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2phl", PD, XD>; +defm TCVTROWPS2PBF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2pbf16h", XD, XD>; +defm TCVTROWPS2PBF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2pbf16l", XS, XS>; + +multiclass m_tilemovrow { + let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { + let SchedRW = [WriteSystem] in { + def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), + (ins TILE:$src1, u8imm:$src2), + "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, TA,PD, EVEX, EVEX_V512; + def rre : I<0x4A, MRMSrcReg4VOp3, (outs VR512:$dst), + (ins TILE:$src1, GR32:$src2), + "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, T8,PD, EVEX, VVVV, EVEX_V512; + } + } // HasAMXAVX512, HasAVX10_2_512, In64BitMode +} + +defm TILEMOVROW : m_tilemovrow; + +let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let usesCustomInserter = 1 in { + def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), + [(set VR512:$dst, (int_x86_tilemovrow timm:$src1, imm:$src2))]>; + def PTILEMOVROWrre : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, GR32:$src2), + [(set VR512:$dst, (int_x86_tilemovrow timm:$src1, GR32:$src2))]>; + } + + def PTILEMOVROWrriV : PseudoI<(outs VR512:$dst), + 
(ins GR16:$src1, GR16:$src2, TILE:$src3, i32u8imm:$src4), + [(set VR512: $dst, + (int_x86_tilemovrow_internal GR16:$src1, GR16:$src2, + TILE:$src3, imm:$src4))]>; + def PTILEMOVROWrreV : PseudoI<(outs VR512:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src3, GR32:$src4), + [(set VR512: $dst, + (int_x86_tilemovrow_internal GR16:$src1, GR16:$src2, + TILE:$src3, GR32:$src4))]>; + } +} diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index d22e7dadaaa262..2eb4e4fb941b29 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -185,6 +185,7 @@ def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; +def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">; def HasCRC32 : Predicate<"Subtarget->hasCRC32()">; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 688e886cf3b13a..af6fb04295bdec 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -266,6 +266,17 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, Col = getColFromRow(II, II->getArgOperand(0), 4); break; } + case Intrinsic::x86_tcvtrowd2ps_internal: + case Intrinsic::x86_tcvtrowps2pbf16h_internal: + case Intrinsic::x86_tcvtrowps2pbf16l_internal: + case Intrinsic::x86_tcvtrowps2phh_internal: + case Intrinsic::x86_tcvtrowps2phl_internal: + case Intrinsic::x86_tilemovrow_internal: { + assert(OpNo == 2 && "Illegal Operand Number."); + Row = II->getArgOperand(0); + Col = II->getArgOperand(1); + break; + } } return std::make_pair(Row, Col); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp 
b/llvm/lib/Target/X86/X86PreTileConfig.cpp index d20bfdcdb7f9c1..d232a1d706549f 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -118,10 +118,22 @@ class X86PreTileConfig : public MachineFunctionPass { bool isAMXInstruction(MachineInstr &MI) { if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3) return false; - - // PTILESTOREDV is the only exception that doesn't def a AMX register. - if (MI.getOpcode() == X86::PTILESTOREDV) + switch (MI.getOpcode()) { + case X86::PTILESTOREDV: + case X86::PTCVTROWD2PSrreV: + case X86::PTCVTROWD2PSrriV: + case X86::PTCVTROWPS2PBF16HrreV: + case X86::PTCVTROWPS2PBF16HrriV: + case X86::PTCVTROWPS2PBF16LrreV: + case X86::PTCVTROWPS2PBF16LrriV: + case X86::PTCVTROWPS2PHHrreV: + case X86::PTCVTROWPS2PHHrriV: + case X86::PTCVTROWPS2PHLrreV: + case X86::PTCVTROWPS2PHLrriV: + case X86::PTILEMOVROWrreV: + case X86::PTILEMOVROWrriV: return true; + } // We can simply check if it is AMX instruction by its def. // But we should exclude old API which uses physical registers. 
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 93911bc51a207d..a973aaaa4806e6 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1880,6 +1880,7 @@ const StringMap<bool> sys::getHostCPUFeatures() { !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX); Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; + Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave; bool HasLeaf24 = MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 691809b6d4b5ad..eb55e6fc9134c8 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -600,6 +600,8 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 = + FeatureAMX_TILE | FeatureAVX10_2_512; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; diff --git a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll new file mode 100644 index 00000000000000..71f8f231747fe7 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs -enable-ipra | FileCheck 
-check-prefix=IPRA %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck -check-prefix=O0 %s + +@buf = dso_local global [3072 x i8] zeroinitializer, align 64 + +define internal void @foo() { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: retq +; +; IPRA-LABEL: foo: +; IPRA: # %bb.0: # %entry +; IPRA-NEXT: retq +; +; O0-LABEL: foo: +; O0: # %bb.0: # %entry +; O0-NEXT: retq +entry: + ret void +} + +define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind { +; CHECK-LABEL: test_api: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $2112, %rsp # imm = 0x840 +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, (%rsp) +; CHECK-NEXT: movb $1, (%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movl $32, %ecx +; CHECK-NEXT: movw $8, %r14w +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0 +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tilestored %tmm0, 1088(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: movl $buf+1024, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1 +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tilestored %tmm1, 64(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg (%rsp) +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm1 # 1024-byte Folded Reload +; CHECK-NEXT: tilemovrow $2, %tmm1, %zmm0 +; CHECK-NEXT: tileloadd 1088(%rsp,%rax), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: tilemovrow $2, %tmm0, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: addq $2112, %rsp # imm = 0x840 +; 
CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +; +; IPRA-LABEL: test_api: +; IPRA: # %bb.0: +; IPRA-NEXT: subq $72, %rsp +; IPRA-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; IPRA-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; IPRA-NEXT: movl $buf, %eax +; IPRA-NEXT: movl $32, %ecx +; IPRA-NEXT: movw $8, %dx +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0 +; IPRA-NEXT: movl $buf+1024, %eax +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1 +; IPRA-NEXT: callq foo +; IPRA-NEXT: tilemovrow $2, %tmm1, %zmm0 +; IPRA-NEXT: tilemovrow $2, %tmm0, %zmm1 +; IPRA-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; IPRA-NEXT: addq $72, %rsp +; IPRA-NEXT: tilerelease +; IPRA-NEXT: retq +; +; O0-LABEL: test_api: +; O0: # %bb.0: +; O0-NEXT: pushq %rbp +; O0-NEXT: movq %rsp, %rbp +; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; O0-NEXT: subq $4096, %rsp # imm = 0x1000 +; O0-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; O0-NEXT: movb $1, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %si, %cx +; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: movw %di, %ax +; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; O0-NEXT: movl $buf, %esi +; O0-NEXT: movl $32, %edi +; O0-NEXT: movw $8, %dx +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; O0-NEXT: movl $64, %edi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; O0-NEXT: movw $8, %dx +; O0-NEXT: tilestored %tmm0, (%rsi,%rdi) +; O0-NEXT: movl $32, %esi +; O0-NEXT: movl $buf+1024, %edx +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; 
O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: movw $8, %ax +; O0-NEXT: tilestored %tmm0, (%rdx,%rsi) +; O0-NEXT: vzeroupper +; O0-NEXT: callq foo +; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload +; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload +; O0-NEXT: movl $64, %edi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; O0-NEXT: movw $8, %cx +; O0-NEXT: # implicit-def: $cl +; O0-NEXT: movb %cl, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; O0-NEXT: movw $8, %cx +; O0-NEXT: tilemovrow $2, %tmm0, %zmm0 +; O0-NEXT: movl $64, %esi +; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; O0-NEXT: movw $8, %cx +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, {{[0-9]+}}(%rsp) +; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0 +; O0-NEXT: movw $8, %cx +; O0-NEXT: tilemovrow $2, %tmm0, %zmm1 +; O0-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; O0-NEXT: movq %rbp, %rsp +; O0-NEXT: popq %rbp +; O0-NEXT: tilerelease +; O0-NEXT: retq + %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) + %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + call void @foo() + %5 = call <16 x i32> @llvm.x86.tilemovrow.internal(i16 8, i16 %1, x86_amx %4, i32 2) + %6 = call <16 x i32> @llvm.x86.tilemovrow.internal(i16 %0, i16 8, x86_amx %3, i32 2) + %7 = add <16 x i32> %5, %6 + ret <16 x i32> %7 +} + + +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare <16 x i32> 
@llvm.x86.tilemovrow.internal(i16, i16, x86_amx, i32) diff --git a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll new file mode 100644 index 00000000000000..da7fedee88821b --- /dev/null +++ b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll @@ -0,0 +1,116 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+amx-tile,+amx-avx512,+avx10.2-512 | FileCheck %s + +define <16 x float> @test_tcvtrowd2ps(i32 %A) { +; CHECK-LABEL: test_tcvtrowd2ps: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowd2ps %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x46,0x48,0x4a,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <16 x float> @llvm.x86.tcvtrowd2ps(i8 1, i32 %A) + ret <16 x float> %ret +} + +define <16 x float> @test_tcvtrowd2psi() { +; CHECK-LABEL: test_tcvtrowd2psi: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowd2ps $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x07,0xc1,0x7f] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <16 x float> @llvm.x86.tcvtrowd2ps(i8 1, i32 127) + ret <16 x float> %ret +} +declare <16 x float> @llvm.x86.tcvtrowd2ps(i8 %A, i32 %B) + +define <32 x bfloat> @test_tcvtrowps2pbf16h(i32 %A) { +; CHECK-LABEL: test_tcvtrowps2pbf16h: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowps2pbf16h %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x47,0x48,0x6d,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 1, i32 %A) + ret <32 x bfloat> %ret +} + +define <32 x bfloat> @test_tcvtrowps2pbf16hi() { +; CHECK-LABEL: test_tcvtrowps2pbf16hi: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowps2pbf16h $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x07,0xc1,0x7f] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 1, i32 127) + ret <32 x bfloat> %ret +} +declare <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 %A, i32 
%B) + +define <32 x bfloat> @test_tcvtrowps2pbf16l(i32 %A) { +; CHECK-LABEL: test_tcvtrowps2pbf16l: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowps2pbf16l %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x46,0x48,0x6d,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 1, i32 %A) + ret <32 x bfloat> %ret +} + +define <32 x bfloat> @test_tcvtrowps2pbf16li() { +; CHECK-LABEL: test_tcvtrowps2pbf16li: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowps2pbf16l $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x77,0xc1,0x7f] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 1, i32 127) + ret <32 x bfloat> %ret +} +declare <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 %A, i32 %B) + +define <32 x half> @test_tcvtrowps2phh(i32 %A) { +; CHECK-LABEL: test_tcvtrowps2phh: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowps2phh %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x44,0x48,0x6d,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.tcvtrowps2phh(i8 1, i32 %A) + ret <32 x half> %ret +} + +define <32 x half> @test_tcvtrowps2phhi() { +; CHECK-LABEL: test_tcvtrowps2phhi: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowps2phh $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7c,0x48,0x07,0xc1,0x7f] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.tcvtrowps2phh(i8 1, i32 127) + ret <32 x half> %ret +} +declare <32 x half> @llvm.x86.tcvtrowps2phh(i8 %A, i32 %B) + +define <32 x half> @test_tcvtrowps2phl(i32 %A) { +; CHECK-LABEL: test_tcvtrowps2phl: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowps2phl %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x45,0x48,0x6d,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.tcvtrowps2phl(i8 1, i32 %A) + ret <32 x half> %ret +} + +define <32 x half> @test_tcvtrowps2phli() { +; CHECK-LABEL: test_tcvtrowps2phli: +; CHECK: # %bb.0: +; CHECK-NEXT: tcvtrowps2phl $127, %tmm1, %zmm0 # encoding: 
[0x62,0xf3,0x7f,0x48,0x77,0xc1,0x7f] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <32 x half> @llvm.x86.tcvtrowps2phl(i8 1, i32 127) + ret <32 x half> %ret +} +declare <32 x half> @llvm.x86.tcvtrowps2phl(i8 %A, i32 %B) + +define <16 x i32> @test_tilemovrow(i32 %A) { +; CHECK-LABEL: test_tilemovrow: +; CHECK: # %bb.0: +; CHECK-NEXT: tilemovrow %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x45,0x48,0x4a,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <16 x i32> @llvm.x86.tilemovrow(i8 1, i32 %A) + ret <16 x i32> %ret +} + +define <16 x i32> @test_tilemovrowi() { +; CHECK-LABEL: test_tilemovrowi: +; CHECK: # %bb.0: +; CHECK-NEXT: tilemovrow $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x07,0xc1,0x7f] +; CHECK-NEXT: retq # encoding: [0xc3] + %ret = call <16 x i32> @llvm.x86.tilemovrow(i8 1, i32 127) + ret <16 x i32> %ret +} +declare <16 x i32> @llvm.x86.tilemovrow(i8 %A, i32 %B) diff --git a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll new file mode 100644 index 00000000000000..b4a5c90bbea330 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx10.2-512, \ +; RUN: -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s + +define void @test_amx(i8* %pointer, i8* %base, i32 %index, i64 %stride) { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tileloadd (%rsi,%rcx), %tmm0 +; CHECK-NEXT: tcvtrowd2ps %edx, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowd2ps $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2pbf16h %edx, %tmm0, 
%zmm0 +; CHECK-NEXT: tcvtrowps2pbf16h $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2pbf16l %edx, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2pbf16l $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2phh %edx, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2phh $16, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2phl %edx, %tmm0, %zmm0 +; CHECK-NEXT: tcvtrowps2phl $16, %tmm0, %zmm0 +; CHECK-NEXT: tilemovrow %edx, %tmm0, %zmm0 +; CHECK-NEXT: tilemovrow $16, %tmm0, %zmm0 +; CHECK-NEXT: tilestored %tmm0, (%rdi,%rcx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + + %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + call <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16 8, i16 8, x86_amx %a, i32 %index) + call <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16 8, i16 8, x86_amx %a, i32 16) + call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal(i16 8, i16 8, x86_amx %a, i32 %index) + call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal(i16 8, i16 8, x86_amx %a, i32 16) + call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal(i16 8, i16 8, x86_amx %a, i32 %index) + call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal(i16 8, i16 8, x86_amx %a, i32 16) + call <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16 8, i16 8, x86_amx %a, i32 %index) + call <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16 8, i16 8, x86_amx %a, i32 16) + call <32 x half> @llvm.x86.tcvtrowps2phl.internal(i16 8, i16 8, x86_amx %a, i32 %index) + call <32 x half> @llvm.x86.tcvtrowps2phl.internal(i16 8, i16 8, x86_amx %a, i32 16) + call <16 x i32> @llvm.x86.tilemovrow.internal(i16 8, i16 8, x86_amx %a, i32 %index) + call <16 x i32> @llvm.x86.tilemovrow.internal(i16 8, i16 8, x86_amx %a, i32 16) + + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %a) + ret void +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx 
@llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) + +declare <16 x float> @llvm.x86.tcvtrowd2ps.internal(i16, i16, x86_amx, i32) +declare <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal(i16, i16, x86_amx, i32) +declare <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal(i16, i16, x86_amx, i32) +declare <32 x half> @llvm.x86.tcvtrowps2phh.internal(i16, i16, x86_amx, i32) +declare <32 x half> @llvm.x86.tcvtrowps2phl.internal(i16, i16, x86_amx, i32) +declare <16 x i32> @llvm.x86.tilemovrow.internal(i16, i16, x86_amx, i32) diff --git a/llvm/test/MC/Disassembler/X86/amx-avx512.txt b/llvm/test/MC/Disassembler/X86/amx-avx512.txt new file mode 100644 index 00000000000000..0a162af1b4bc02 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/amx-avx512.txt @@ -0,0 +1,106 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: tcvtrowd2ps %ecx, %tmm5, %zmm22 +# INTEL: tcvtrowd2ps zmm22, tmm5, ecx +0x62,0xe2,0x76,0x48,0x4a,0xf5 + +# ATT: tcvtrowd2ps %ecx, %tmm2, %zmm22 +# INTEL: tcvtrowd2ps zmm22, tmm2, ecx +0x62,0xe2,0x76,0x48,0x4a,0xf2 + +# ATT: tcvtrowd2ps $123, %tmm5, %zmm22 +# INTEL: tcvtrowd2ps zmm22, tmm5, 123 +0x62,0xe3,0x7e,0x48,0x07,0xf5,0x7b + +# ATT: tcvtrowd2ps $123, %tmm2, %zmm22 +# INTEL: tcvtrowd2ps zmm22, tmm2, 123 +0x62,0xe3,0x7e,0x48,0x07,0xf2,0x7b + +# ATT: tcvtrowps2pbf16h %ecx, %tmm5, %zmm22 +# INTEL: tcvtrowps2pbf16h zmm22, tmm5, ecx +0x62,0xe2,0x77,0x48,0x6d,0xf5 + +# ATT: tcvtrowps2pbf16h %ecx, %tmm2, %zmm22 +# INTEL: tcvtrowps2pbf16h zmm22, tmm2, ecx +0x62,0xe2,0x77,0x48,0x6d,0xf2 + +# ATT: tcvtrowps2pbf16h $123, %tmm5, %zmm22 +# INTEL: tcvtrowps2pbf16h zmm22, tmm5, 123 +0x62,0xe3,0x7f,0x48,0x07,0xf5,0x7b + +# ATT: tcvtrowps2pbf16h $123, %tmm2, %zmm22 +# INTEL: tcvtrowps2pbf16h zmm22, tmm2, 123 
+0x62,0xe3,0x7f,0x48,0x07,0xf2,0x7b + +# ATT: tcvtrowps2pbf16l %ecx, %tmm5, %zmm22 +# INTEL: tcvtrowps2pbf16l zmm22, tmm5, ecx +0x62,0xe2,0x76,0x48,0x6d,0xf5 + +# ATT: tcvtrowps2pbf16l %ecx, %tmm2, %zmm22 +# INTEL: tcvtrowps2pbf16l zmm22, tmm2, ecx +0x62,0xe2,0x76,0x48,0x6d,0xf2 + +# ATT: tcvtrowps2pbf16l $123, %tmm5, %zmm22 +# INTEL: tcvtrowps2pbf16l zmm22, tmm5, 123 +0x62,0xe3,0x7e,0x48,0x77,0xf5,0x7b + +# ATT: tcvtrowps2pbf16l $123, %tmm2, %zmm22 +# INTEL: tcvtrowps2pbf16l zmm22, tmm2, 123 +0x62,0xe3,0x7e,0x48,0x77,0xf2,0x7b + +# ATT: tcvtrowps2phh %ecx, %tmm5, %zmm22 +# INTEL: tcvtrowps2phh zmm22, tmm5, ecx +0x62,0xe2,0x74,0x48,0x6d,0xf5 + +# ATT: tcvtrowps2phh %ecx, %tmm2, %zmm22 +# INTEL: tcvtrowps2phh zmm22, tmm2, ecx +0x62,0xe2,0x74,0x48,0x6d,0xf2 + +# ATT: tcvtrowps2phh $123, %tmm5, %zmm22 +# INTEL: tcvtrowps2phh zmm22, tmm5, 123 +0x62,0xe3,0x7c,0x48,0x07,0xf5,0x7b + +# ATT: tcvtrowps2phh $123, %tmm2, %zmm22 +# INTEL: tcvtrowps2phh zmm22, tmm2, 123 +0x62,0xe3,0x7c,0x48,0x07,0xf2,0x7b + +# ATT: tcvtrowps2phl %ecx, %tmm5, %zmm22 +# INTEL: tcvtrowps2phl zmm22, tmm5, ecx +0x62,0xe2,0x75,0x48,0x6d,0xf5 + +# ATT: tcvtrowps2phl %ecx, %tmm2, %zmm22 +# INTEL: tcvtrowps2phl zmm22, tmm2, ecx +0x62,0xe2,0x75,0x48,0x6d,0xf2 + +# ATT: tcvtrowps2phl $123, %tmm5, %zmm22 +# INTEL: tcvtrowps2phl zmm22, tmm5, 123 +0x62,0xe3,0x7f,0x48,0x77,0xf5,0x7b + +# ATT: tcvtrowps2phl $123, %tmm2, %zmm22 +# INTEL: tcvtrowps2phl zmm22, tmm2, 123 +0x62,0xe3,0x7f,0x48,0x77,0xf2,0x7b + +# ATT: tilemovrow %ecx, %tmm3, %zmm22 +# INTEL: tilemovrow zmm22, tmm3, ecx +0x62,0xe2,0x75,0x48,0x4a,0xf3 + +# ATT: tilemovrow %ecx, %tmm2, %zmm22 +# INTEL: tilemovrow zmm22, tmm2, ecx +0x62,0xe2,0x75,0x48,0x4a,0xf2 + +# ATT: tilemovrow $123, %tmm3, %zmm22 +# INTEL: tilemovrow zmm22, tmm3, 123 +0x62,0xe3,0x7d,0x48,0x07,0xf3,0x7b + +# ATT: tilemovrow $123, %tmm2, %zmm22 +# INTEL: tilemovrow zmm22, tmm2, 123 +0x62,0xe3,0x7d,0x48,0x07,0xf2,0x7b + +# ATT: tilemovrow %edx, %tmm0, %zmm22 +# INTEL: tilemovrow 
zmm22, tmm0, edx +0x62,0xe2,0x6d,0x48,0x4a,0xf0 + +# ATT: tilemovrow $123, %tmm0, %zmm22 +# INTEL: tilemovrow zmm22, tmm0, 123 +0x62,0xe3,0x7d,0x48,0x07,0xf0,0x7b diff --git a/llvm/test/MC/X86/amx-avx512-att.s b/llvm/test/MC/X86/amx-avx512-att.s new file mode 100644 index 00000000000000..6da4ede82c6217 --- /dev/null +++ b/llvm/test/MC/X86/amx-avx512-att.s @@ -0,0 +1,105 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding < %s | FileCheck %s + +// CHECK: tcvtrowd2ps %ecx, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x76,0x48,0x4a,0xf5] + tcvtrowd2ps %ecx, %tmm5, %zmm22 + +// CHECK: tcvtrowd2ps %ecx, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x76,0x48,0x4a,0xf2] + tcvtrowd2ps %ecx, %tmm2, %zmm22 + +// CHECK: tcvtrowd2ps $123, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x07,0xf5,0x7b] + tcvtrowd2ps $123, %tmm5, %zmm22 + +// CHECK: tcvtrowd2ps $123, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x07,0xf2,0x7b] + tcvtrowd2ps $123, %tmm2, %zmm22 + +// CHECK: tcvtrowps2pbf16h %ecx, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x77,0x48,0x6d,0xf5] + tcvtrowps2pbf16h %ecx, %tmm5, %zmm22 + +// CHECK: tcvtrowps2pbf16h %ecx, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x77,0x48,0x6d,0xf2] + tcvtrowps2pbf16h %ecx, %tmm2, %zmm22 + +// CHECK: tcvtrowps2pbf16h $123, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x07,0xf5,0x7b] + tcvtrowps2pbf16h $123, %tmm5, %zmm22 + +// CHECK: tcvtrowps2pbf16h $123, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x07,0xf2,0x7b] + tcvtrowps2pbf16h $123, %tmm2, %zmm22 + +// CHECK: tcvtrowps2pbf16l %ecx, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x76,0x48,0x6d,0xf5] + tcvtrowps2pbf16l %ecx, %tmm5, %zmm22 + +// CHECK: tcvtrowps2pbf16l %ecx, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x76,0x48,0x6d,0xf2] + tcvtrowps2pbf16l %ecx, %tmm2, %zmm22 + +// CHECK: tcvtrowps2pbf16l $123, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x77,0xf5,0x7b] + tcvtrowps2pbf16l $123, %tmm5, 
%zmm22 + +// CHECK: tcvtrowps2pbf16l $123, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x77,0xf2,0x7b] + tcvtrowps2pbf16l $123, %tmm2, %zmm22 + +// CHECK: tcvtrowps2phh %ecx, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x74,0x48,0x6d,0xf5] + tcvtrowps2phh %ecx, %tmm5, %zmm22 + +// CHECK: tcvtrowps2phh %ecx, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x74,0x48,0x6d,0xf2] + tcvtrowps2phh %ecx, %tmm2, %zmm22 + +// CHECK: tcvtrowps2phh $123, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7c,0x48,0x07,0xf5,0x7b] + tcvtrowps2phh $123, %tmm5, %zmm22 + +// CHECK: tcvtrowps2phh $123, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7c,0x48,0x07,0xf2,0x7b] + tcvtrowps2phh $123, %tmm2, %zmm22 + +// CHECK: tcvtrowps2phl %ecx, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x75,0x48,0x6d,0xf5] + tcvtrowps2phl %ecx, %tmm5, %zmm22 + +// CHECK: tcvtrowps2phl %ecx, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x75,0x48,0x6d,0xf2] + tcvtrowps2phl %ecx, %tmm2, %zmm22 + +// CHECK: tcvtrowps2phl $123, %tmm5, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x77,0xf5,0x7b] + tcvtrowps2phl $123, %tmm5, %zmm22 + +// CHECK: tcvtrowps2phl $123, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x77,0xf2,0x7b] + tcvtrowps2phl $123, %tmm2, %zmm22 + +// CHECK: tilemovrow %ecx, %tmm3, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x75,0x48,0x4a,0xf3] + tilemovrow %ecx, %tmm3, %zmm22 + +// CHECK: tilemovrow %ecx, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x75,0x48,0x4a,0xf2] + tilemovrow %ecx, %tmm2, %zmm22 + +// CHECK: tilemovrow $123, %tmm3, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7d,0x48,0x07,0xf3,0x7b] + tilemovrow $123, %tmm3, %zmm22 + +// CHECK: tilemovrow $123, %tmm2, %zmm22 +// CHECK: encoding: [0x62,0xe3,0x7d,0x48,0x07,0xf2,0x7b] + tilemovrow $123, %tmm2, %zmm22 + +// CHECK: tilemovrow %edx, %tmm0, %zmm22 +// CHECK: encoding: [0x62,0xe2,0x6d,0x48,0x4a,0xf0] + tilemovrow %edx, %tmm0, %zmm22 + +// CHECK: tilemovrow $123, %tmm0, %zmm22 +// CHECK: encoding: 
[0x62,0xe3,0x7d,0x48,0x07,0xf0,0x7b] + tilemovrow $123, %tmm0, %zmm22 diff --git a/llvm/test/MC/X86/amx-avx512-intel.s b/llvm/test/MC/X86/amx-avx512-intel.s new file mode 100644 index 00000000000000..3a517a6cd1aabb --- /dev/null +++ b/llvm/test/MC/X86/amx-avx512-intel.s @@ -0,0 +1,105 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: tcvtrowd2ps zmm22, tmm5, ecx +// CHECK: encoding: [0x62,0xe2,0x76,0x48,0x4a,0xf5] + tcvtrowd2ps zmm22, tmm5, ecx + +// CHECK: tcvtrowd2ps zmm22, tmm2, ecx +// CHECK: encoding: [0x62,0xe2,0x76,0x48,0x4a,0xf2] + tcvtrowd2ps zmm22, tmm2, ecx + +// CHECK: tcvtrowd2ps zmm22, tmm5, 123 +// CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x07,0xf5,0x7b] + tcvtrowd2ps zmm22, tmm5, 123 + +// CHECK: tcvtrowd2ps zmm22, tmm2, 123 +// CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x07,0xf2,0x7b] + tcvtrowd2ps zmm22, tmm2, 123 + +// CHECK: tcvtrowps2pbf16h zmm22, tmm5, ecx +// CHECK: encoding: [0x62,0xe2,0x77,0x48,0x6d,0xf5] + tcvtrowps2pbf16h zmm22, tmm5, ecx + +// CHECK: tcvtrowps2pbf16h zmm22, tmm2, ecx +// CHECK: encoding: [0x62,0xe2,0x77,0x48,0x6d,0xf2] + tcvtrowps2pbf16h zmm22, tmm2, ecx + +// CHECK: tcvtrowps2pbf16h zmm22, tmm5, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x07,0xf5,0x7b] + tcvtrowps2pbf16h zmm22, tmm5, 123 + +// CHECK: tcvtrowps2pbf16h zmm22, tmm2, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x07,0xf2,0x7b] + tcvtrowps2pbf16h zmm22, tmm2, 123 + +// CHECK: tcvtrowps2pbf16l zmm22, tmm5, ecx +// CHECK: encoding: [0x62,0xe2,0x76,0x48,0x6d,0xf5] + tcvtrowps2pbf16l zmm22, tmm5, ecx + +// CHECK: tcvtrowps2pbf16l zmm22, tmm2, ecx +// CHECK: encoding: [0x62,0xe2,0x76,0x48,0x6d,0xf2] + tcvtrowps2pbf16l zmm22, tmm2, ecx + +// CHECK: tcvtrowps2pbf16l zmm22, tmm5, 123 +// CHECK: encoding: [0x62,0xe3,0x7e,0x48,0x77,0xf5,0x7b] + tcvtrowps2pbf16l zmm22, tmm5, 123 + +// CHECK: tcvtrowps2pbf16l zmm22, tmm2, 123 +// CHECK: encoding: 
[0x62,0xe3,0x7e,0x48,0x77,0xf2,0x7b] + tcvtrowps2pbf16l zmm22, tmm2, 123 + +// CHECK: tcvtrowps2phh zmm22, tmm5, ecx +// CHECK: encoding: [0x62,0xe2,0x74,0x48,0x6d,0xf5] + tcvtrowps2phh zmm22, tmm5, ecx + +// CHECK: tcvtrowps2phh zmm22, tmm2, ecx +// CHECK: encoding: [0x62,0xe2,0x74,0x48,0x6d,0xf2] + tcvtrowps2phh zmm22, tmm2, ecx + +// CHECK: tcvtrowps2phh zmm22, tmm5, 123 +// CHECK: encoding: [0x62,0xe3,0x7c,0x48,0x07,0xf5,0x7b] + tcvtrowps2phh zmm22, tmm5, 123 + +// CHECK: tcvtrowps2phh zmm22, tmm2, 123 +// CHECK: encoding: [0x62,0xe3,0x7c,0x48,0x07,0xf2,0x7b] + tcvtrowps2phh zmm22, tmm2, 123 + +// CHECK: tcvtrowps2phl zmm22, tmm5, ecx +// CHECK: encoding: [0x62,0xe2,0x75,0x48,0x6d,0xf5] + tcvtrowps2phl zmm22, tmm5, ecx + +// CHECK: tcvtrowps2phl zmm22, tmm2, ecx +// CHECK: encoding: [0x62,0xe2,0x75,0x48,0x6d,0xf2] + tcvtrowps2phl zmm22, tmm2, ecx + +// CHECK: tcvtrowps2phl zmm22, tmm5, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x77,0xf5,0x7b] + tcvtrowps2phl zmm22, tmm5, 123 + +// CHECK: tcvtrowps2phl zmm22, tmm2, 123 +// CHECK: encoding: [0x62,0xe3,0x7f,0x48,0x77,0xf2,0x7b] + tcvtrowps2phl zmm22, tmm2, 123 + +// CHECK: tilemovrow zmm22, tmm3, ecx +// CHECK: encoding: [0x62,0xe2,0x75,0x48,0x4a,0xf3] + tilemovrow zmm22, tmm3, ecx + +// CHECK: tilemovrow zmm22, tmm2, ecx +// CHECK: encoding: [0x62,0xe2,0x75,0x48,0x4a,0xf2] + tilemovrow zmm22, tmm2, ecx + +// CHECK: tilemovrow zmm22, tmm3, 123 +// CHECK: encoding: [0x62,0xe3,0x7d,0x48,0x07,0xf3,0x7b] + tilemovrow zmm22, tmm3, 123 + +// CHECK: tilemovrow zmm22, tmm2, 123 +// CHECK: encoding: [0x62,0xe3,0x7d,0x48,0x07,0xf2,0x7b] + tilemovrow zmm22, tmm2, 123 + +// CHECK: tilemovrow zmm22, tmm0, edx +// CHECK: encoding: [0x62,0xe2,0x6d,0x48,0x4a,0xf0] + tilemovrow zmm22, tmm0, edx + +// CHECK: tilemovrow zmm22, tmm0, 123 +// CHECK: encoding: [0x62,0xe3,0x7d,0x48,0x07,0xf0,0x7b] + tilemovrow zmm22, tmm0, 123 >From 44501d8d8a7096decc236fc41f296ae014cec991 Mon Sep 17 00:00:00 2001 From: "Wang, Phoebe" 
<phoebe.w...@intel.com> Date: Sat, 9 Nov 2024 12:59:57 +0800 Subject: [PATCH 2/2] Resolve compile fail without SSE2 --- clang/lib/Headers/amxavx512intrin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h index 945edea543e706..e4d77e503015af 100644 --- a/clang/lib/Headers/amxavx512intrin.h +++ b/clang/lib/Headers/amxavx512intrin.h @@ -12,7 +12,7 @@ #ifndef __AMX_AVX512INTRIN_H #define __AMX_AVX512INTRIN_H -#ifdef __x86_64__ +#if defined(__x86_64__) && defined(__SSE2__) #define __DEFAULT_FN_ATTRS_AVX512 \ __attribute__((__always_inline__, __nodebug__, \ @@ -378,5 +378,5 @@ static __m512i __tile_movrow(__tile1024i src0, unsigned src1) { return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1); } -#endif // __x86_64__ +#endif // __x86_64__ && __SSE2__ #endif // __AMX_AVX512INTRIN_H _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits