llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-mc Author: Phoebe Wang (phoebewang) <details> <summary>Changes</summary> Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 --- Patch is 184.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113532.diff 57 Files Affected: - (modified) clang/docs/ReleaseNotes.rst (+2) - (modified) clang/include/clang/Basic/BuiltinsX86_64.def (+11) - (modified) clang/include/clang/Driver/Options.td (+2) - (modified) clang/lib/Basic/Targets/X86.cpp (+6) - (modified) clang/lib/Basic/Targets/X86.h (+1) - (modified) clang/lib/CodeGen/CGBuiltin.cpp (+52) - (modified) clang/lib/Headers/CMakeLists.txt (+1) - (modified) clang/lib/Headers/amxintrin.h (+2) - (added) clang/lib/Headers/amxtransposeintrin.h (+248) - (modified) clang/lib/Headers/immintrin.h (+4) - (modified) clang/lib/Sema/SemaX86.cpp (+6) - (added) clang/test/CodeGen/X86/amx_transpose.c (+36) - (added) clang/test/CodeGen/X86/amx_transpose_api.c (+66) - (added) clang/test/CodeGen/X86/amx_transpose_errors.c (+31) - (modified) clang/test/Driver/x86-target-features.c (+7) - (modified) clang/test/Preprocessor/x86_target_features.c (+12) - (modified) llvm/include/llvm/CodeGen/TileShapeInfo.h (+80-7) - (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+37) - (modified) llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h (+1) - (modified) llvm/include/llvm/TargetParser/X86TargetParser.def (+1) - (modified) llvm/lib/Target/X86/AsmParser/X86Operand.h (+31) - (modified) llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp (+5) - (modified) llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h (+7) - (modified) llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp (+19) - (modified) llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h (+1) - (modified) llvm/lib/Target/X86/X86.td (+3) - (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+125) - (modified) llvm/lib/Target/X86/X86FastPreTileConfig.cpp (+38-15) - (modified) llvm/lib/Target/X86/X86FastTileConfig.cpp (+28-12) - (modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+70) - (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+94) - (modified) llvm/lib/Target/X86/X86InstrAMX.td (+63) - (modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+10-2) - (modified) llvm/lib/Target/X86/X86InstrOperands.td (+7) - (modified) llvm/lib/Target/X86/X86InstrPredicates.td (+1) - (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+194-52) - (modified) llvm/lib/Target/X86/X86PreTileConfig.cpp (+33-12) - (modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+56-4) - (modified) llvm/lib/Target/X86/X86RegisterInfo.td (+9) - (modified) llvm/lib/Target/X86/X86TileConfig.cpp (+71-11) - (modified) llvm/lib/TargetParser/Host.cpp (+4) - (modified) llvm/lib/TargetParser/X86TargetParser.cpp (+1) - (added) llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll (+136) - (added) llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir (+165) - (added) llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir (+153) - (added) llvm/test/CodeGen/X86/amx_tile_pair_copy.mir (+97) - (added) llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll (+86) - (added) llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll (+60) - (added) llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir (+134) - (added) llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir (+113) - (added) llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll (+150) - (modified) llvm/test/CodeGen/X86/ipra-reg-usage.ll (+2-2) - (added) llvm/test/MC/Disassembler/X86/amx-transpose-att.s (+57) - (added) llvm/test/MC/Disassembler/X86/amx-transpose-att.txt (+58) - (added) llvm/test/MC/Disassembler/X86/amx-transpose-intel.s (+57) - (modified) llvm/unittests/CodeGen/InstrRefLDVTest.cpp (+3-3) - (modified) llvm/utils/TableGen/X86RecognizableInstr.cpp (+4) ``````````diff diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ce046a305c89b6..dc58f98af55cc9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -623,6 +623,8 @@ X86 Support - All intrinsics in tbmintrin.h can now be used in constant expressions. +- Support ISA of ``AMX-TRANSPOSE``. + Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 2c591edb2835cd..4e95a8a73d550a 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -128,6 +128,11 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16") TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") @@ -148,6 +153,12 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite") TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose") + TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2ddb2f5312148e..c55f2b86f4cb1f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6287,6 +6287,8 @@ def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>; def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>; def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>; +def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>; +def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>; def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>; def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 5448bd841959f4..fe5b600e6777fb 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -418,6 +418,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAMXTILE = true; } else if (Feature == "+amx-complex") { HasAMXCOMPLEX = true; + } else if (Feature == "+amx-transpose") { + HasAMXTRANSPOSE = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -935,6 +937,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP16__"); if (HasAMXCOMPLEX) Builder.defineMacro("__AMX_COMPLEX__"); + if (HasAMXTRANSPOSE) + Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -1065,6 +1069,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-fp16", true) .Case("amx-int8", true) .Case("amx-tile", true) + .Case("amx-transpose", true) .Case("avx", true) .Case("avx10.1-256", true) .Case("avx10.1-512", true) @@ -1182,6 +1187,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-fp16", HasAMXFP16) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) + .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) .Case("avx10.1-256", HasAVX10_1) .Case("avx10.1-512", HasAVX10_1_512) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index a99ae62984c7d5..3e1fb41082950c 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -156,6 +156,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXINT8 = false; bool HasAMXBF16 = false; bool HasAMXCOMPLEX = false; + bool HasAMXTRANSPOSE = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; bool HasUSERMSR = false; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3f28b7f26c36fe..67d28ccec0f373 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16920,6 +16920,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // instruction, but it will create a memset that won't be optimized away. return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true); } + // Corresponding to intrisics which will return 2 tiles (tile0_tile1). + case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: { + Intrinsic::ID IID; + switch (BuiltinID) { + default: + llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + IID = Intrinsic::x86_t2rpntlvwz0_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + IID = Intrinsic::x86_t2rpntlvwz0t1_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz1_internal: + IID = Intrinsic::x86_t2rpntlvwz1_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + IID = Intrinsic::x86_t2rpntlvwz1t1_internal; + break; + } + + // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) + Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), + {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]}); + + auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>(); + assert(PtrTy && "arg3 must be of pointer type"); + QualType PtreeTy = PtrTy->getPointeeType(); + llvm::Type *TyPtee = ConvertType(PtreeTy); + + // Bitcast amx type (x86_amx) to vector type (256 x i32) + // Then store tile0 into DstPtr0 + Value *T0 = Builder.CreateExtractValue(Call, 0); + Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, + {TyPtee}, {T0}); + Builder.CreateDefaultAlignedStore(VecT0, Ops[3]); + + // Then store tile1 into DstPtr1 + Value *T1 = Builder.CreateExtractValue(Call, 1); + Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, + {TyPtee}, {T1}); + Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); + + // Note: Here we escape directly use x86_tilestored64_internal to store + // the results due to it can't make sure the Mem writen scope. This may + // cause shapes reloads after first amx intrinsic, which current amx reg- + // ister allocation has no ability to handle it. + + return Store; + } case X86::BI__ud2: // llvm.trap makes a ud2a instruction on x86. return EmitTrapCall(Intrinsic::trap); diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index ff392e7122a448..708525198324bb 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -149,6 +149,7 @@ set(x86_files amxcomplexintrin.h amxfp16intrin.h amxintrin.h + amxtransposeintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index baa56f5b28e8e5..f07a5689011853 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -232,6 +232,8 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { /// bytes. Since there is no 2D type in llvm IR, we use vector type to /// represent 2D tile and the fixed size is maximum amx tile register size. typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); +typedef int _tile1024i_1024a + __attribute__((__vector_size__(1024), __aligned__(1024))); /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h new file mode 100644 index 00000000000000..d5dc68f4152848 --- /dev/null +++ b/clang/lib/Headers/amxtransposeintrin.h @@ -0,0 +1,248 @@ +/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_TRANSPOSEINTRIN_H +#define __AMX_TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_TRANSPOSE \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) + +#define _tile_2rpntlvwz0(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0(tdst, base, stride) +#define _tile_2rpntlvwz0t1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride) +#define _tile_2rpntlvwz1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1(tdst, base, stride) +#define _tile_2rpntlvwz1t1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) + +/// Transpose 32-bit elements from \a src and write the result to \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// void __tile_transposed(__tile dst, __tile src); +/// \endcode +/// +/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The 1st source tile. Max size is 1024 Bytes. +/// +/// \code{.operation} +/// +/// FOR i := 0 TO (dst.rows-1) +/// tmp[511:0] := 0 +/// FOR j := 0 TO (dst.colsb/4-1) +/// tmp.dword[j] := src.row[j].dword[i] +/// ENDFOR +/// dst.row[i] := tmp +/// ENDFOR +/// +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src) + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0, + (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0t1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0, + (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1t1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE +_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { + return __builtin_ia32_ttransposed_internal(m, n, src); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will be not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Transpose 32-bit elements from src and write the result to dst. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic cor... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/113532 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits