https://github.com/MalaySanghi created https://github.com/llvm/llvm-project/pull/115151
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 >From 3b6510da8fb3b9709839ea0c102355879b11aa6d Mon Sep 17 00:00:00 2001 From: Malay Sanghi <malay.san...@intel.com> Date: Tue, 5 Nov 2024 13:37:54 +0800 Subject: [PATCH 1/2] [X86][AMX] Support AMX-MOVRS Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 --- clang/include/clang/Basic/BuiltinsX86_64.def | 14 ++ clang/include/clang/Driver/Options.td | 2 + clang/lib/Basic/Targets/X86.cpp | 6 + clang/lib/Basic/Targets/X86.h | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 18 +- clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/amxmovrsintrin.h | 48 +++++ clang/lib/Headers/amxtransposeintrin.h | 177 ++++++++++++++++++ clang/lib/Headers/immintrin.h | 4 + clang/lib/Sema/SemaX86.cpp | 6 + clang/test/CodeGen/X86/amx_movrs.c | 25 +++ clang/test/CodeGen/X86/amx_movrs_api.c | 34 ++++ clang/test/CodeGen/X86/amx_movrs_errors.c | 14 ++ clang/test/CodeGen/X86/amx_movrs_tranpose.c | 53 ++++++ .../test/CodeGen/X86/amx_movrs_tranpose_api.c | 81 ++++++++ .../CodeGen/X86/amx_movrs_transpose_errors.c | 22 +++ llvm/include/llvm/IR/IntrinsicsX86.td | 48 +++++ llvm/lib/Target/X86/X86.td | 3 + llvm/lib/Target/X86/X86ExpandPseudo.cpp | 35 ++++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 109 ++++++++++- llvm/lib/Target/X86/X86ISelLowering.cpp | 81 ++++++++ llvm/lib/Target/X86/X86InstrAMX.td | 91 +++++++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 1 + llvm/lib/Target/X86/X86InstrPredicates.td | 1 + llvm/lib/Target/X86/X86LowerAMXType.cpp | 8 +- llvm/lib/Target/X86/X86RegisterInfo.cpp | 10 +- llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll | 108 +++++++++++ .../X86/amx_movrs_transpose_intrinsics.ll | 92 +++++++++ .../Disassembler/X86/AMX/x86-64-amx-movrs.txt | 98 ++++++++++ llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s | 89 +++++++++ llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s | 97 ++++++++++ 31 files changed, 1371 insertions(+), 6 deletions(-) create mode 100644 clang/lib/Headers/amxmovrsintrin.h create mode 100755 clang/test/CodeGen/X86/amx_movrs.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_api.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_errors.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_tranpose.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_tranpose_api.c create mode 100755 clang/test/CodeGen/X86/amx_movrs_transpose_errors.c create mode 100755 llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll create mode 100755 llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll create mode 100755 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt create mode 100755 llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s create mode 100755 llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index d95e8455a304b6..98235023bddc7b 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr") // AMX internal builtin TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") @@ -129,15 +131,27 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") + // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") + +TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs") +TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 805b79491e6ea4..c36adb673dd4ea 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>; def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>; def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>; +def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>; +def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>; def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>; def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index d7d3adef42c79a..0ddc1ac4c47f22 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAMXCOMPLEX = true; } else if (Feature == "+amx-fp8") { HasAMXFP8 = true; + } else if (Feature == "+amx-movrs") { + HasAMXMOVRS = true; } else if (Feature == "+amx-transpose") { HasAMXTRANSPOSE = true; } else if (Feature == "+cmpccxadd") { @@ -953,6 +955,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_COMPLEX__"); if (HasAMXFP8) Builder.defineMacro("__AMX_FP8__"); + if (HasAMXMOVRS) + Builder.defineMacro("__AMX_MOVRS__"); if (HasAMXTRANSPOSE) Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasCMPCCXADD) @@ -1085,6 +1089,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-fp16", true) .Case("amx-fp8", true) .Case("amx-int8", true) + .Case("amx-movrs", true) .Case("amx-tile", true) .Case("amx-transpose", true) .Case("avx", true) @@ -1205,6 +1210,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-fp16", HasAMXFP16) .Case("amx-fp8", HasAMXFP8) .Case("amx-int8", HasAMXINT8) + .Case("amx-movrs", HasAMXMOVRS) .Case("amx-tile", HasAMXTILE) .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index e2eba63b992355..54a078d2f137b6 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXBF16 = false; bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; + bool HasAMXMOVRS = false; bool HasAMXTRANSPOSE = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 34fedd67114751..02ee0132bbb5eb 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16996,9 +16996,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } // Corresponding to intrisics which will return 2 tiles (tile0_tile1). case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: { + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { Intrinsic::ID IID; switch (BuiltinID) { default: @@ -17006,15 +17010,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_t2rpntlvwz0_internal: IID = Intrinsic::x86_t2rpntlvwz0_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: + IID = Intrinsic::x86_t2rpntlvwz0rs_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: IID = Intrinsic::x86_t2rpntlvwz0t1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: + IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz1_internal: IID = Intrinsic::x86_t2rpntlvwz1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + IID = Intrinsic::x86_t2rpntlvwz1rs_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: IID = Intrinsic::x86_t2rpntlvwz1t1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: + IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; + break; } // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 67242cd4d981bc..a50cf01eac6fef 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -151,6 +151,7 @@ set(x86_files amxfp8intrin.h amxintrin.h amxtransposeintrin.h + amxmovrsintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxmovrsintrin.h b/clang/lib/Headers/amxmovrsintrin.h new file mode 100644 index 00000000000000..5fe2fdecb8b5dd --- /dev/null +++ b/clang/lib/Headers/amxmovrsintrin.h @@ -0,0 +1,48 @@ +/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-------------------------------------------------------------------=== */ + +#ifndef __IMMINTRIN_H +#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMXMOVRSINTRIN_H +#define __AMXMOVRSINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_MOVRS \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs"))) + +#define _tile_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +#define _tile_stream_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrs64_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrst164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS +__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs( + __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride); +} +#undef __DEFAULT_FN_ATTRS_MOVRS +#endif /* __x86_64__ */ +#endif /* __AMXMOVRSINTRIN_H */ diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h index b3fa37d766c45b..086c9a75222ca1 100644 --- a/clang/lib/Headers/amxtransposeintrin.h +++ b/clang/lib/Headers/amxtransposeintrin.h @@ -17,6 +17,9 @@ #define __DEFAULT_FN_ATTRS_TRANSPOSE \ __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) +#define __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-transpose,amx-movrs"))) #define _tile_2rpntlvwz0(tdst, base, stride) \ __builtin_ia32_t2rpntlvwz0(tdst, base, stride) @@ -26,6 +29,15 @@ __builtin_ia32_t2rpntlvwz1(tdst, base, stride) #define _tile_2rpntlvwz1t1(tdst, base, stride) \ __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) +// MOVRS versions +#define _tile_2rpntlvwz0rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) +#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) +#define _tile_2rpntlvwz1rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) +#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) /// Transpose 32-bit elements from \a src and write the result to \a dst. /// @@ -101,6 +113,45 @@ _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { return __builtin_ia32_ttransposed_internal(m, n, src); } +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz0rs_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz0rst1_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz1rs_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz1rst1_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + /// Converts a pair of tiles from memory into VNNI format, and places the /// results in a pair of destinations specified by dst. The pair of tiles /// in memory is specified via a tsib; the second tile is after the first @@ -229,6 +280,131 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, &dst1->tile, base, stride); } +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + /// Transpose 32-bit elements from src and write the result to dst. /// /// \headerfile <immintrin.h> @@ -244,5 +420,6 @@ static void __tile_transposed(__tile1024i *dst, __tile1024i src) { dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); } +#undef __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS #endif /* __x86_64__ */ #endif /* __AMX_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 4bf7eac4195eec..5035f02d889e72 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -656,6 +656,10 @@ _storebe_i64(void * __P, long long __D) { #include <amxtransposeintrin.h> #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__) +#include <amxmovrsintrin.h> +#endif + #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include <avx512vp2intersectintrin.h> diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index ef878d16d445fd..4d3b0292a56a98 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -629,12 +629,18 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { return false; case X86::BI__builtin_ia32_tileloadd64: case X86::BI__builtin_ia32_tileloaddt164: + case X86::BI__builtin_ia32_tileloaddrs64: + case X86::BI__builtin_ia32_tileloaddrst164: case X86::BI__builtin_ia32_tilestored64: case X86::BI__builtin_ia32_tilezero: case X86::BI__builtin_ia32_t2rpntlvwz0: case X86::BI__builtin_ia32_t2rpntlvwz0t1: case X86::BI__builtin_ia32_t2rpntlvwz1: case X86::BI__builtin_ia32_t2rpntlvwz1t1: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1: + case X86::BI__builtin_ia32_t2rpntlvwz1rs: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1: + case X86::BI__builtin_ia32_t2rpntlvwz0rs: return CheckBuiltinTileArgumentsRange(TheCall, 0); case X86::BI__builtin_ia32_tdpbssd: case X86::BI__builtin_ia32_tdpbsud: diff --git a/clang/test/CodeGen/X86/amx_movrs.c b/clang/test/CodeGen/X86/amx_movrs.c new file mode 100755 index 00000000000000..4a8f001baafcea --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs.c @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -target-feature +avx512f \ +// RUN: -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -Wno-gnu-statement-expression| FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +#define STRIDE 32 + +char buf[1024]; + +void test_tile_loadd(short row, short col) { + // CHECK-LABEL: define dso_local void @test_tile_loadd( + // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + _tile_loaddrs_internal(row, col, buf, STRIDE); +} + +void test_tile_loaddt1(short row, short col) { + // CHECK-LABEL: define dso_local void @test_tile_loaddt1( + // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + _tile_loaddrst1_internal(row, col, buf, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_api.c b/clang/test/CodeGen/X86/amx_movrs_api.c new file mode 100755 index 00000000000000..cf430adf140852 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_api.c @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -Wno-gnu-statement-expression| FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +#define STRIDE 32 + +char buf[1024]; + +void test_tile_loadd(short row) { + // CHECK-LABEL: define dso_local void @test_tile_loadd( + // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + __tile1024i a = {row, 8}; + __tile_loaddrs(&a, buf, STRIDE); +} + +void test_tile_loaddt1(short row) { + // CHECK-LABEL: define dso_local void @test_tile_loaddt1( + // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) + // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) + __tile1024i a = {row, 8}; + __tile_stream_loaddrs(&a, buf, STRIDE); +} + +void test_tile_loadd_macro(void *data) { + // CHECK-LABEL: define dso_local void @test_tile_loadd_macro( + // CHECK: call void @llvm.x86.tileloaddrs64(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}}) + // CHECK: call void @llvm.x86.tileloaddrst164(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}}) + _tile_loaddrs(4, data, STRIDE); + _tile_stream_loaddrs(2, data, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c new file mode 100755 index 00000000000000..bac7d962f5cb5c --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_errors.c @@ -0,0 +1,14 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-reduce -target-feature +amx-memory \ +// RUN: -target-feature +amx-format -target-feature +amx-element -emit-llvm -verify + +#include <immintrin.h> +#include <stddef.h> + +char buf[1024]; + +void test_tile_load() { + _tile_loaddrs(20, buf, 32); // expected-error {{argument value 20 is outside the valid range [0, 7]}} + _tile_stream_loaddrs(-1, buf, 20); // expected-error {{argument value 255 is outside the valid range [0, 7]}} +} diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c new file mode 100755 index 00000000000000..192c153835e1e6 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_tranpose.c @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +char buf[2048]; +#define STRIDE 32 + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rs_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rst1_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rs_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1); +} + +// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rst1_internal( +// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}}) +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024 +// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1 +// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}}) +void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) { + _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c new file mode 100755 index 00000000000000..b174cc5067bf30 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c @@ -0,0 +1,81 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \ +// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s + +#include <immintrin.h> +#include <stddef.h> + +char buf[2048]; +#define STRIDE 32 + +void test_tile_2rpntlvwz0rs(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0rs + // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0rs(1, A, B); +} + +void test_tile_2rpntlvwz0rst1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0rst1 + // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0rst1(1, A, B); +} + +void test_tile_2rpntlvwz1rs(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1rs + // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1rs(1, A, B); +} + +void test_tile_2rpntlvwz1rst1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1rst1 + // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1rst1(1, A, B); +} + +void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz0rs + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz0rst1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz1rs + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE); +} + +void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test__tile_2rpntlvwz1rst1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE); +} diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c new file mode 100755 index 00000000000000..c8846b36ffa874 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \ +// RUN: -emit-llvm -verify + +#include <immintrin.h> +#include <stddef.h> + +void test_tile_2rpntlvwz0rs(const void *A, size_t B) { + _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz0rst1(const void *A, size_t B) { + _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1rs(const void *A, size_t B) { + _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1rst1(const void *A, size_t B) { + _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index c42397024e45a7..d37bda13c3c4ac 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5882,6 +5882,12 @@ let TargetPrefix = "x86" in { def int_x86_tilestored64 : ClangBuiltin<"__builtin_ia32_tilestored64">, Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], [ImmArg<ArgIndex<0>>]>; + def int_x86_tileloaddrs64 : ClangBuiltin<"__builtin_ia32_tileloaddrs64">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_tileloaddrst164 : ClangBuiltin<"__builtin_ia32_tileloaddrst164">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; def int_x86_tdpbssd : ClangBuiltin<"__builtin_ia32_tdpbssd">, Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, @@ -5952,6 +5958,20 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; + // AMX-MORVS, AMX-TRANSPOSE + def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_t2rpntlvwz0rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rst1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_t2rpntlvwz1rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rs">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + def int_x86_t2rpntlvwz1rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rst1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg<ArgIndex<0>>]>; + // AMX - internal intrinsics def int_x86_ldtilecfg_internal : ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">, @@ -5966,6 +5986,16 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], []>; + def int_x86_tileloaddrs64_internal : + ClangBuiltin<"__builtin_ia32_tileloaddrs64_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_tileloaddrst164_internal : + ClangBuiltin<"__builtin_ia32_tileloaddrst164_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; def int_x86_tdpbssd_internal : ClangBuiltin<"__builtin_ia32_tdpbssd_internal">, Intrinsic<[llvm_x86amx_ty], @@ -6030,6 +6060,24 @@ let TargetPrefix = "x86" in { llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; + // AMX-MORVS, AMX-TRANSPOSE - internal intrinsics + def int_x86_t2rpntlvwz0rs_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly, IntrReadMem]>; + def int_x86_t2rpntlvwz0rst1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz1rs_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz1rst1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrArgMemOnly]>; + def int_x86_t2rpntlvwz0_internal : Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 160e7c0fc0310a..dfeffae6dec4f5 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -273,6 +273,9 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", "Support AMX-FP8 instructions", [FeatureAMXTILE]>; +def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true", + "Support AMX-MOVRS instructions", + [FeatureAMXTILE]>; def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", "Support AMX amx-transpose instructions", [FeatureAMXTILE]>; diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index f832955d1202fa..94072502c829a4 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -558,6 +558,15 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::LDTILECFG))); return true; } + case X86::PTILELOADDRSV: + case X86::PTILELOADDRST1V: { + for (unsigned i = 2; i > 0; --i) + MI.removeOperand(i); + unsigned Opc = + Opcode == X86::PTILELOADDRSV ? X86::TILELOADDRS : X86::TILELOADDRST1; + MI.setDesc(TII->get(Opc)); + return true; + } case X86::PTILELOADDV: case X86::PTILELOADDT1V: { for (unsigned i = 2; i > 0; --i) @@ -687,6 +696,32 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: { + for (unsigned i = 3; i > 0; --i) + MI.removeOperand(i); + unsigned Opc; + switch (Opcode) { + case X86::PT2RPNTLVWZ0RSV: + Opc = X86::T2RPNTLVWZ0RS; + break; + case X86::PT2RPNTLVWZ0RST1V: + Opc = X86::T2RPNTLVWZ0RST1; + break; + case X86::PT2RPNTLVWZ1RSV: + Opc = X86::T2RPNTLVWZ1RS; + break; + case X86::PT2RPNTLVWZ1RST1V: + Opc = X86::T2RPNTLVWZ1RST1; + break; + default: + llvm_unreachable("Impossible Opcode!"); + } + MI.setDesc(TII->get(Opc)); + return true; + } case X86::PTTRANSPOSEDV: { for (int i = 2; i > 0; --i) MI.removeOperand(i); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index aea86c280e2f99..b2b18c1cf45576 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -338,6 +338,10 @@ namespace { case X86::PT2RPNTLVWZ0T1V: case X86::PT2RPNTLVWZ1V: case X86::PT2RPNTLVWZ1T1V: + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: return true; } for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { @@ -5189,6 +5193,33 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_tileloaddrs64_internal: + case Intrinsic::x86_tileloaddrst164_internal: { + if (!Subtarget->hasAMXMOVRS()) + break; + unsigned Opc = IntNo == Intrinsic::x86_tileloaddrs64_internal + ? X86::PTILELOADDRSV + : X86::PTILELOADDRST1V; + // _tile_loadd_internal(row, col, buf, STRIDE) + SDValue Base = Node->getOperand(4); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(5); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + SDValue Ops[] = {Node->getOperand(2), + Node->getOperand(3), + Base, + Scale, + Index, + Disp, + Segment, + Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); + ReplaceNode(Node, CNode); + return; + } } break; } @@ -5307,6 +5338,44 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_tileloaddrs64: + case Intrinsic::x86_tileloaddrst164: { + if (!Subtarget->hasAMXMOVRS()) + break; + auto *MFI = + CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); + unsigned Opc; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_tileloaddrs64: + Opc = X86::PTILELOADDRS; + break; + case Intrinsic::x86_tileloaddrst164: + Opc = X86::PTILELOADDRST1; + break; + } + // FIXME: Match displacement and scale. + unsigned TIndex = Node->getConstantOperandVal(2); + SDValue TReg = getI8Imm(TIndex, dl); + SDValue Base = Node->getOperand(3); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(4); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + if (Opc == X86::PTILESTORED) { + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, TReg, Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + } else { + SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + } + ReplaceNode(Node, CNode); + return; + } case Intrinsic::x86_t2rpntlvwz0: case Intrinsic::x86_t2rpntlvwz0t1: case Intrinsic::x86_t2rpntlvwz1: @@ -5342,9 +5411,45 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); SDValue Chain = Node->getOperand(0); - MachineSDNode *CNode; SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; - CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceNode(Node, CNode); + return; + } + case Intrinsic::x86_t2rpntlvwz0rs: + case Intrinsic::x86_t2rpntlvwz0rst1: + case Intrinsic::x86_t2rpntlvwz1rs: + case Intrinsic::x86_t2rpntlvwz1rst1: { + if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS()) + break; + unsigned Opc; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_t2rpntlvwz0rs: + Opc = X86::PT2RPNTLVWZ0RS; + break; + case Intrinsic::x86_t2rpntlvwz0rst1: + Opc = X86::PT2RPNTLVWZ0RST1; + break; + case Intrinsic::x86_t2rpntlvwz1rs: + Opc = X86::PT2RPNTLVWZ1RS; + break; + case Intrinsic::x86_t2rpntlvwz1rst1: + Opc = X86::PT2RPNTLVWZ1RST1; + break; + } + // FIXME: Match displacement and scale. + unsigned TIndex = Node->getConstantOperandVal(2); + SDValue TReg = getI8Imm(TIndex, dl); + SDValue Base = Node->getOperand(3); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(4); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; + MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); return; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0ae814d0ca3bb4..e9a3b0b675564f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27291,6 +27291,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_t2rpntlvwz0rs_internal: + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + case Intrinsic::x86_t2rpntlvwz1rs_internal: + case Intrinsic::x86_t2rpntlvwz1rst1_internal: + if (!Subtarget.hasAMXTRANSPOSE() || !Subtarget.hasAMXMOVRS()) + break; + [[fallthrough]]; case Intrinsic::x86_t2rpntlvwz0_internal: case Intrinsic::x86_t2rpntlvwz0t1_internal: case Intrinsic::x86_t2rpntlvwz1_internal: @@ -27316,6 +27323,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case Intrinsic::x86_t2rpntlvwz1t1_internal: Opc = X86::PT2RPNTLVWZ1T1V; break; + case Intrinsic::x86_t2rpntlvwz0rs_internal: + Opc = X86::PT2RPNTLVWZ0RSV; + break; + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + Opc = X86::PT2RPNTLVWZ0RST1V; + break; + case Intrinsic::x86_t2rpntlvwz1rs_internal: + Opc = X86::PT2RPNTLVWZ1RSV; + break; + case Intrinsic::x86_t2rpntlvwz1rst1_internal: + Opc = X86::PT2RPNTLVWZ1RST1V; + break; } SDLoc DL(Op); @@ -37553,6 +37572,35 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PTILELOADDRS: + case X86::PTILELOADDRST1: { + unsigned Opc; + switch (MI.getOpcode()) { + default: + llvm_unreachable("illegal opcode!"); + case X86::PTILELOADDRS: + Opc = X86::TILELOADDRS; + break; + case X86::PTILELOADDRST1: + Opc = X86::TILELOADDRST1; + break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); + unsigned CurOp = 0; + if (Opc != X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Define); + MIB.add(MI.getOperand(CurOp++)); // base + MIB.add(MI.getOperand(CurOp++)); // scale + MIB.add(MI.getOperand(CurOp++)); // index -- stride + MIB.add(MI.getOperand(CurOp++)); // displacement + MIB.add(MI.getOperand(CurOp++)); // segment + if (Opc == X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Undef); + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } case X86::PTCMMIMFP16PS: case X86::PTCMMRLFP16PS: { const MIMetadata MIMD(MI); @@ -37605,6 +37653,39 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PT2RPNTLVWZ0RS: + case X86::PT2RPNTLVWZ0RST1: + case X86::PT2RPNTLVWZ1RS: + case X86::PT2RPNTLVWZ1RST1: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::PT2RPNTLVWZ0RS: + Opc = X86::T2RPNTLVWZ0RS; + break; + case X86::PT2RPNTLVWZ0RST1: + Opc = X86::T2RPNTLVWZ0RST1; + break; + case X86::PT2RPNTLVWZ1RS: + Opc = X86::T2RPNTLVWZ1RS; + break; + case X86::PT2RPNTLVWZ1RST1: + Opc = X86::T2RPNTLVWZ1RST1; + break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); + + MIB.add(MI.getOperand(1)); // base + MIB.add(MI.getOperand(2)); // scale + MIB.add(MI.getOperand(3)); // index + MIB.add(MI.getOperand(4)); // displacement + MIB.add(MI.getOperand(5)); // segment + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } case X86::PTTRANSPOSED: { const DebugLoc &DL = MI.getDebugLoc(); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 947a8bec2890ef..efd396cd2bc439 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -369,3 +369,94 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { } } } // HasAMXTILE, HasAMXTRANSPOSE + +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { + def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5; + def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5; + def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5, PD; + def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T_MAP5, PD; + let isPseudo = true in { + def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + } + let usesCustomInserter = 1 in { + def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>; + } +} // HasAMXMOVRS, HasAMXTRANSPOSE + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { + def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", + []>, VEX, T8, XD; + def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", + []>, VEX, T8, PD; + + let isPseudo = true, mayLoad = 1 in + def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; + let isPseudo = true, mayLoad = 1 in + def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; + let usesCustomInserter = 1 in { + let mayLoad = 1 in + def PTILELOADDRS : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + let mayLoad = 1 in + def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + } + + def TILELOADDRSrm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T8, XD; + def TILELOADDRST1rm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T8, PD; + + def T2RPNTLVWZ0RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T_MAP5; + def T2RPNTLVWZ0RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T_MAP5; + def T2RPNTLVWZ1RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T_MAP5, PD; + def T2RPNTLVWZ1RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src1), + "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", + []>, EVEX, NoCD8, T_MAP5, PD; +} // HasAMXMOVRS, In64BitMode diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 9b002ebd3a93bc..41bece5e2cec4e 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4738,6 +4738,7 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: case X86::PTILEPAIRLOAD: + case X86::TILELOADDRS: return true; } } diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index d22e7dadaaa262..7a31e4212670b9 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -184,6 +184,7 @@ def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; +def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">; def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 688e886cf3b13a..c5c0f7a03b33a8 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -229,7 +229,13 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II, case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: - case Intrinsic::x86_tilestored64_internal: { + case Intrinsic::x86_tilestored64_internal: + case Intrinsic::x86_t2rpntlvwz0rs_internal: + case Intrinsic::x86_t2rpntlvwz0rst1_internal: + case Intrinsic::x86_t2rpntlvwz1rs_internal: + case Intrinsic::x86_t2rpntlvwz1rst1_internal: + case Intrinsic::x86_tileloaddrs64_internal: + case Intrinsic::x86_tileloaddrst164_internal: { Row = II->getArgOperand(0); Col = II->getArgOperand(1); break; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 2daaa95b06be0d..6c7fc4bd49e80d 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1078,7 +1078,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTDPFP16PSV: case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: - case X86::PTTRANSPOSEDV: { + case X86::PTTRANSPOSEDV: + case X86::PTILELOADDRSV: + case X86::PTILELOADDRST1V: { MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); ShapeT Shape(&MO1, &MO2, MRI); @@ -1088,7 +1090,11 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PT2RPNTLVWZ0V: case X86::PT2RPNTLVWZ0T1V: case X86::PT2RPNTLVWZ1V: - case X86::PT2RPNTLVWZ1T1V: { + case X86::PT2RPNTLVWZ1T1V: + case X86::PT2RPNTLVWZ0RSV: + case X86::PT2RPNTLVWZ0RST1V: + case X86::PT2RPNTLVWZ1RSV: + case X86::PT2RPNTLVWZ1RST1V: { MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); MachineOperand &MO3 = MI->getOperand(3); diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll new file mode 100755 index 00000000000000..da212a1850964e --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s + +define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { +; CHECK-LABEL: test_amx_internal: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movw %di, %ax +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloaddrs (%rdx,%rsi), %tmm0 +; CHECK-NEXT: movl $64, %esi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +entry: + %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + ret void +} +declare x86_amx @llvm.x86.tileloaddrs64.internal(i16, i16, ptr, i64) + +define void @test_amx_old(i16 %m, i16 %n, ptr %buf) { +; CHECK-LABEL: test_amx_old: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $32, %eax +; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2 +; CHECK-NEXT: retq +entry: + call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32) + ret void +} +declare void @llvm.x86.tileloaddrs64(i8 immarg, ptr, i64) + +define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { +; CHECK-LABEL: test_amx_t1_internal: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: movw %ax, %cx +; CHECK-NEXT: movw %di, %ax +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0 +; CHECK-NEXT: movl $64, %esi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +entry: + %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + ret void +} +declare x86_amx @llvm.x86.tileloaddrst164.internal(i16, i16, ptr, i64) + +define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) { +; CHECK-LABEL: test_amx_t1_old: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $32, %eax +; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 +; CHECK-NEXT: retq +entry: + call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32) + ret void +} +declare void @llvm.x86.tileloaddrst164(i8 immarg, ptr, i64) diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll new file mode 100755 index 00000000000000..146b69773eb186 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 +; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 + +define void @test_amx(i64 %stride, i8* %addr1) #0 { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 +; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 +; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 +; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 +; CHECK-NEXT: retq + call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride) + ret void +} +declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 ) +declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 ) + +define void @test_amx2(i8* %base, i64 %stride) #0 { +; O0-LABEL: test_amx2: +; O0: # %bb.0: +; O0-NEXT: xorps %xmm0, %xmm0 +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 +; O0-NEXT: movw $8, %ax +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: # implicit-def: $al +; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 +; O0-NEXT: tilerelease +; O0-NEXT: retq +; +; O2-LABEL: test_amx2: +; O2: # %bb.0: +; O2-NEXT: xorps %xmm0, %xmm0 +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; O2-NEXT: movw $8, %ax +; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 +; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 +; O2-NEXT: tilerelease +; O2-NEXT: retq + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + ret void +} +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64) diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt new file mode 100755 index 00000000000000..6df44c87d2332f --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -0,0 +1,98 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL + +# ATT: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rs 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64] +0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40 + +# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] +0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rst1 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64] +0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40 + +# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rs 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64] +0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40 + +# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] +0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rst1 64(%rbx), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64] +0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40 + +# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 +# INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrs 291(%r8,%rax,4), %tmm3 +# INTEL: tileloaddrs tmm3, [r8 + 4*rax + 291] +0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrs 64(%rbx), %tmm6 +# INTEL: tileloaddrs tmm6, [rbx + 64] +0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40 + +# ATT: tileloaddrs -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrs tmm3, [2*rbp - 32] +0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 +# INTEL: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrst1 291(%r8,%rax,4), %tmm3 +# INTEL: tileloaddrst1 tmm3, [r8 + 4*rax + 291] +0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrst1 64(%rbx), %tmm6 +# INTEL: tileloaddrst1 tmm6, [rbx + 64] +0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40 + +# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrst1 tmm3, [2*rbp - 32] +0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s new file mode 100755 index 00000000000000..d780ad4f0e3691 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -0,0 +1,89 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] + t2rpntlvwz0rs 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] + t2rpntlvwz0rst1 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] + t2rpntlvwz1rs 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] + t2rpntlvwz1rst1 64(%rbx), %tmm6 + +// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 + +// CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrs 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: tileloaddrs 291(%r8,%rax,4), %tmm3 +// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r8,%rax,4), %tmm3 + +// CHECK: tileloaddrs -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrs -32(,%rbp,2), %tmm3 + +// CHECK: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 +// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrst1 268435456(%rbp,%r14,8), %tmm6 + +// CHECK: tileloaddrst1 291(%r8,%rax,4), %tmm3 +// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r8,%rax,4), %tmm3 + +// CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrst1 -32(,%rbp,2), %tmm3 \ No newline at end of file diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s new file mode 100755 index 00000000000000..ccc7ac51a98a44 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -0,0 +1,97 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40] + t2rpntlvwz0rs tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40] + t2rpntlvwz0rst1 tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0rst1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40] + t2rpntlvwz1rs tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40] + t2rpntlvwz1rst1 tmm6, [rbx + 64] + +// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1rst1 tmm2, [2*rbp - 32] + +// CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrs tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: tileloaddrs tmm3, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs tmm3, [r8 + 4*rax + 291] + +// CHECK: tileloaddrs tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40] + tileloaddrs tmm6, [rbx + 64] + +// CHECK: tileloaddrs tmm3, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrs tmm3, [2*rbp - 32] + +// CHECK: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10] + tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: tileloaddrst1 tmm3, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 tmm3, [r8 + 4*rax + 291] + +// CHECK: tileloaddrst1 tmm6, [rbx + 64] +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40] + tileloaddrst1 tmm6, [rbx + 64] + +// CHECK: tileloaddrst1 tmm3, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + tileloaddrst1 tmm3, [2*rbp - 32] >From e7a09d7ccbbcd0ed222cdbc57236d2158306457e Mon Sep 17 00:00:00 2001 From: Malay Sanghi <malay.san...@intel.com> Date: Wed, 6 Nov 2024 18:16:51 +0800 Subject: [PATCH 2/2] update test --- clang/test/CodeGen/X86/amx_movrs_errors.c | 2 +- clang/test/CodeGen/X86/amx_movrs_transpose_errors.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c index bac7d962f5cb5c..2790126eb8672b 100755 --- a/clang/test/CodeGen/X86/amx_movrs_errors.c +++ b/clang/test/CodeGen/X86/amx_movrs_errors.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ // RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-reduce -target-feature +amx-memory \ -// RUN: -target-feature +amx-format -target-feature +amx-element -emit-llvm -verify +// RUN: -target-feature +amx-format -target-feature +amx-element -verify #include <immintrin.h> #include <stddef.h> diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c index c8846b36ffa874..840b52bbb29bbf 100755 --- a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c +++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ // RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \ -// RUN: -emit-llvm -verify +// RUN: -verify #include <immintrin.h> #include <stddef.h> _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits