llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-clang Author: Malay Sanghi (MalaySanghi) <details> <summary>Changes</summary> Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 --- Patch is 81.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115151.diff 31 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86_64.def (+14) - (modified) clang/include/clang/Driver/Options.td (+2) - (modified) clang/lib/Basic/Targets/X86.cpp (+6) - (modified) clang/lib/Basic/Targets/X86.h (+1) - (modified) clang/lib/CodeGen/CGBuiltin.cpp (+17-1) - (modified) clang/lib/Headers/CMakeLists.txt (+1) - (added) clang/lib/Headers/amxmovrsintrin.h (+48) - (modified) clang/lib/Headers/amxtransposeintrin.h (+177) - (modified) clang/lib/Headers/immintrin.h (+4) - (modified) clang/lib/Sema/SemaX86.cpp (+6) - (added) clang/test/CodeGen/X86/amx_movrs.c (+25) - (added) clang/test/CodeGen/X86/amx_movrs_api.c (+34) - (added) clang/test/CodeGen/X86/amx_movrs_errors.c (+14) - (added) clang/test/CodeGen/X86/amx_movrs_tranpose.c (+53) - (added) clang/test/CodeGen/X86/amx_movrs_tranpose_api.c (+81) - (added) clang/test/CodeGen/X86/amx_movrs_transpose_errors.c (+22) - (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+48) - (modified) llvm/lib/Target/X86/X86.td (+3) - (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+35) - (modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+107-2) - (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+81) - (modified) llvm/lib/Target/X86/X86InstrAMX.td (+91) - (modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+1) - (modified) llvm/lib/Target/X86/X86InstrPredicates.td (+1) - (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+7-1) - (modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+8-2) - (added) llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll (+108) - (added) llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll (+92) - (added) llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt (+98) - (added) llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s (+89) - (added) llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s (+97) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index d95e8455a304b6..98235023bddc7b 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr") // AMX internal builtin TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") @@ -129,15 +131,27 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") + // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose") + +TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs") +TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs") TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 805b79491e6ea4..c36adb673dd4ea 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>; def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>; def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>; +def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>; +def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>; def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>; def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index d7d3adef42c79a..0ddc1ac4c47f22 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAMXCOMPLEX = true; } else if (Feature == "+amx-fp8") { HasAMXFP8 = true; + } else if (Feature == "+amx-movrs") { + HasAMXMOVRS = true; } else if (Feature == "+amx-transpose") { HasAMXTRANSPOSE = true; } else if (Feature == "+cmpccxadd") { @@ -953,6 +955,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_COMPLEX__"); if (HasAMXFP8) Builder.defineMacro("__AMX_FP8__"); + if (HasAMXMOVRS) + Builder.defineMacro("__AMX_MOVRS__"); if (HasAMXTRANSPOSE) Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasCMPCCXADD) @@ -1085,6 +1089,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-fp16", true) .Case("amx-fp8", true) .Case("amx-int8", true) + .Case("amx-movrs", true) .Case("amx-tile", true) .Case("amx-transpose", true) .Case("avx", true) @@ -1205,6 +1210,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-fp16", HasAMXFP16) .Case("amx-fp8", HasAMXFP8) .Case("amx-int8", HasAMXINT8) + .Case("amx-movrs", HasAMXMOVRS) .Case("amx-tile", HasAMXTILE) .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index e2eba63b992355..54a078d2f137b6 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXBF16 = false; bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; + bool HasAMXMOVRS = false; bool HasAMXTRANSPOSE = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 34fedd67114751..02ee0132bbb5eb 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16996,9 +16996,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } // Corresponding to intrisics which will return 2 tiles (tile0_tile1). case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: case X86::BI__builtin_ia32_t2rpntlvwz1_internal: - case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: { + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: { Intrinsic::ID IID; switch (BuiltinID) { default: @@ -17006,15 +17010,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_t2rpntlvwz0_internal: IID = Intrinsic::x86_t2rpntlvwz0_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: + IID = Intrinsic::x86_t2rpntlvwz0rs_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: IID = Intrinsic::x86_t2rpntlvwz0t1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: + IID = Intrinsic::x86_t2rpntlvwz0rst1_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz1_internal: IID = Intrinsic::x86_t2rpntlvwz1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + IID = Intrinsic::x86_t2rpntlvwz1rs_internal; + break; case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: IID = Intrinsic::x86_t2rpntlvwz1t1_internal; break; + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: + IID = Intrinsic::x86_t2rpntlvwz1rst1_internal; + break; } // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 67242cd4d981bc..a50cf01eac6fef 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -151,6 +151,7 @@ set(x86_files amxfp8intrin.h amxintrin.h amxtransposeintrin.h + amxmovrsintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxmovrsintrin.h b/clang/lib/Headers/amxmovrsintrin.h new file mode 100644 index 00000000000000..5fe2fdecb8b5dd --- /dev/null +++ b/clang/lib/Headers/amxmovrsintrin.h @@ -0,0 +1,48 @@ +/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-------------------------------------------------------------------=== */ + +#ifndef __IMMINTRIN_H +#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMXMOVRSINTRIN_H +#define __AMXMOVRSINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_MOVRS \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs"))) + +#define _tile_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +#define _tile_stream_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrs64_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrst164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS +__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs( + __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride); +} +#undef __DEFAULT_FN_ATTRS_MOVRS +#endif /* __x86_64__ */ +#endif /* __AMXMOVRSINTRIN_H */ diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h index b3fa37d766c45b..086c9a75222ca1 100644 --- a/clang/lib/Headers/amxtransposeintrin.h +++ b/clang/lib/Headers/amxtransposeintrin.h @@ -17,6 +17,9 @@ #define __DEFAULT_FN_ATTRS_TRANSPOSE \ __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) +#define __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-transpose,amx-movrs"))) #define _tile_2rpntlvwz0(tdst, base, stride) \ __builtin_ia32_t2rpntlvwz0(tdst, base, stride) @@ -26,6 +29,15 @@ __builtin_ia32_t2rpntlvwz1(tdst, base, stride) #define _tile_2rpntlvwz1t1(tdst, base, stride) \ __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) +// MOVRS versions +#define _tile_2rpntlvwz0rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) +#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) +#define _tile_2rpntlvwz1rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) +#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) /// Transpose 32-bit elements from \a src and write the result to \a dst. /// @@ -101,6 +113,45 @@ _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { return __builtin_ia32_ttransposed_internal(m, n, src); } +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz0rs_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz0rst1_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz1rs_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +_tile_2rpntlvwz1rst1_internal(unsigned short row, unsigned short col0, + unsigned short col1, _tile1024i *dst0, + _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + /// Converts a pair of tiles from memory into VNNI format, and places the /// results in a pair of destinations specified by dst. The pair of tiles /// in memory is specified via a tsib; the second tile is after the first @@ -229,6 +280,131 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, &dst1->tile, base, stride); } +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS +static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/115151 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits