llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-mc
@llvm/pr-subscribers-clang-driver

Author: Alan Zhao (alanzhao1)

<details>
<summary>Changes</summary>

Reverts llvm/llvm-project#<!-- -->114070

Reason: Causes `immintrin.h` to fail to compile if `-msse` and `-mno-sse2` are passed to clang: https://github.com/llvm/llvm-project/pull/114070#issuecomment-2465926700

---

Patch is 82.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115570.diff

31 Files Affected:

- (modified) clang/docs/ReleaseNotes.rst (-1)
- (modified) clang/include/clang/Basic/BuiltinsX86_64.def (-13)
- (modified) clang/include/clang/Driver/Options.td (-2)
- (modified) clang/lib/Basic/Targets/X86.cpp (-6)
- (modified) clang/lib/Basic/Targets/X86.h (-1)
- (modified) clang/lib/Headers/CMakeLists.txt (-1)
- (removed) clang/lib/Headers/amxavx512intrin.h (-382)
- (modified) clang/lib/Headers/immintrin.h (-4)
- (modified) clang/lib/Sema/SemaX86.cpp (-6)
- (removed) clang/test/CodeGen/X86/amx_avx512_api.c (-52)
- (removed) clang/test/CodeGen/X86/amxavx512-builtins.c (-41)
- (modified) clang/test/CodeGen/attr-target-x86.c (+4-4)
- (modified) clang/test/Driver/x86-target-features.c (-7)
- (modified) clang/test/Preprocessor/x86_target_features.c (-12)
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (-51)
- (modified) llvm/include/llvm/TargetParser/X86TargetParser.def (-1)
- (modified) llvm/lib/Target/X86/X86.td (-4)
- (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+5-62)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (-76)
- (modified) llvm/lib/Target/X86/X86InstrAMX.td (-147)
- (modified) llvm/lib/Target/X86/X86InstrPredicates.td (-1)
- (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (-11)
- (modified) llvm/lib/Target/X86/X86PreTileConfig.cpp (+3-15)
- (modified) llvm/lib/TargetParser/Host.cpp (-1)
- (modified) llvm/lib/TargetParser/X86TargetParser.cpp (-2)
- (removed) llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll (-171)
- (removed) 
llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll (-116) - (removed) llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll (-61) - (removed) llvm/test/MC/Disassembler/X86/amx-avx512.txt (-106) - (removed) llvm/test/MC/X86/amx-avx512-att.s (-105) - (removed) llvm/test/MC/X86/amx-avx512-intel.s (-105) ``````````diff diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c3424e0e6f34c9..f82fbb73b12162 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -739,7 +739,6 @@ X86 Support * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``. - Support ISA of ``AMX-FP8``. - Support ISA of ``AMX-TRANSPOSE``. -- Support ISA of ``AMX-AVX512``. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 9f7462b1e0d962..d95e8455a304b6 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -133,12 +133,6 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", 
"n", "amx-avx512,avx10.2-512") // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") @@ -165,13 +159,6 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose") -TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l, "V32yIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512,avx10.2-512") - TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 0dba5672c5a85d..8887e0c1495d2a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6285,8 +6285,6 @@ def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>; def mno_fp_ret_in_387 : Flag<["-"], "mno-fp-ret-in-387">, Alias<mno_x87>; def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>; def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>; -def mamx_avx512 : Flag<["-"], "mamx-avx512">, Group<m_x86_Features_Group>; -def mno_amx_avx512 : Flag<["-"], "mno-amx-avx512">, Group<m_x86_Features_Group>; def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>; def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, 
Group<m_x86_Features_Group>; def mamx_complex : Flag<["-"], "mamx-complex">, Group<m_x86_Features_Group>; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 3c3dbfa13e452b..d7d3adef42c79a 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -432,8 +432,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasAMXFP8 = true; } else if (Feature == "+amx-transpose") { HasAMXTRANSPOSE = true; - } else if (Feature == "+amx-avx512") { - HasAMXAVX512 = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -957,8 +955,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP8__"); if (HasAMXTRANSPOSE) Builder.defineMacro("__AMX_TRANSPOSE__"); - if (HasAMXAVX512) - Builder.defineMacro("__AMX_AVX512__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -1084,7 +1080,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { return llvm::StringSwitch<bool>(Name) .Case("adx", true) .Case("aes", true) - .Case("amx-avx512", true) .Case("amx-bf16", true) .Case("amx-complex", true) .Case("amx-fp16", true) @@ -1205,7 +1200,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch<bool>(Feature) .Case("adx", HasADX) .Case("aes", HasAES) - .Case("amx-avx512", HasAMXAVX512) .Case("amx-bf16", HasAMXBF16) .Case("amx-complex", HasAMXCOMPLEX) .Case("amx-fp16", HasAMXFP16) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 70047731b17295..e2eba63b992355 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -159,7 +159,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXCOMPLEX = false; bool HasAMXFP8 = false; bool HasAMXTRANSPOSE = false; - bool HasAMXAVX512 = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; bool HasUSERMSR = false; diff --git 
a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 76366ca1f108e9..67242cd4d981bc 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -146,7 +146,6 @@ set(x86_files adcintrin.h adxintrin.h ammintrin.h - amxavx512intrin.h amxcomplexintrin.h amxfp16intrin.h amxfp8intrin.h diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h deleted file mode 100644 index 945edea543e706..00000000000000 --- a/clang/lib/Headers/amxavx512intrin.h +++ /dev/null @@ -1,382 +0,0 @@ -/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead." -#endif // __IMMINTRIN_H - -#ifndef __AMX_AVX512INTRIN_H -#define __AMX_AVX512INTRIN_H -#ifdef __x86_64__ - -#define __DEFAULT_FN_ATTRS_AVX512 \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("amx-avx512,avx10.2-512"))) - -/// Moves a row from a tile register to a zmm destination register, converting -/// the int32 source elements to fp32. The row of the tile is selected by a -/// 32b GPR. 
-/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row); -/// \endcode -/// -/// \code{.operation} -/// VL := 512 -/// VL_bytes := VL >> 3 -/// row_index := row & 0xffff -/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes -/// FOR i := 0 TO (VL_bytes / 4) - 1 -/// IF i + row_chunk / 4 >= tsrc.colsb / 4 -/// dst.dword[i] := 0 -/// ELSE -/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE) -/// FI -/// ENDFOR -/// dst[MAX_VL-1:VL] := 0 -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCVTROWD2PS instruction. -/// -/// \param tsrc -/// The source tile. Max size is 1024 Bytes. -/// \param row -/// The row of the source tile -#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row) - -/// Moves a row from a tile register to a zmm destination register, converting -/// the fp32 source elements to bf16. It places the resulting bf16 elements -/// in the high 16 bits within each dword. The row of the tile is selected -/// by a 32b GPR. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// __m512i _tile_cvtrowps2pbf16h(__tile tsrc, unsigned int row); -/// \endcode -/// -/// \code{.operation} -/// VL := 512 -/// VL_bytes := VL >> 3 -/// row_index := row & 0xffff -/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes -/// FOR i := 0 TO (VL_bytes / 4) - 1 -/// IF i + row_chunk / 4 >= tsrc.colsb / 4 -/// dst.dword[i] := 0 -/// ELSE -/// dst.word[2*i+0] := 0 -/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) -/// FI -/// ENDFOR -/// dst[MAX_VL-1:VL] := 0 -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCVTROWPS2PBF16H instruction. -/// -/// \param tsrc -/// The source tile. Max size is 1024 Bytes. -/// \param row -/// The the row of the source tile. 
-#define _tile_cvtrowps2pbf16h(tsrc, row) \ - __builtin_ia32_tcvtrowps2pbf16h(tsrc, row) - -/// Moves a row from a tile register to a zmm destination register, converting -/// the fp32 source elements to bf16. It places the resulting bf16 elements -/// in the low 16 bits within each dword. The row of the tile is selected -/// by a 32b GPR. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// __m512i _tile_cvtrowps2pbf16l(__tile tsrc, unsigned int row); -/// \endcode -/// -/// \code{.operation} -/// VL := 512 -/// VL_bytes := VL >> 3 -/// row_index := row & 0xffff -/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes -/// FOR i := 0 TO (VL_bytes / 4) - 1 -/// IF i + row_chunk / 4 >= tsrc.colsb / 4 -/// dst.dword[i] := 0 -/// ELSE -/// dst.word[2*i+1] := 0 -/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) -/// FI -/// ENDFOR -/// dst[MAX_VL-1:VL] := 0 -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCVTROWPS2PBF16L instruction. -/// -/// \param tsrc -/// The source tile. Max size is 1024 Bytes. -/// \param row -/// The the row of the source tile. -#define _tile_cvtrowps2pbf16l(tsrc, row) \ - __builtin_ia32_tcvtrowps2pbf16l(tsrc, row) - -/// Moves a row from a tile register to a zmm destination register, converting -/// the fp32 source elements to fp16. It places the resulting fp16 elements -/// in the high 16 bits within each dword. The row of the tile is selected -/// by a 32b GPR. 
-/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// __m512i _tile_cvtrowps2phh(__tile tsrc, unsigned int row); -/// \endcode -/// -/// \code{.operation} -/// VL := 512 -/// VL_bytes := VL >> 3 -/// row_index := row & 0xffff -/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes -/// FOR i := 0 TO (VL_bytes / 4) - 1 -/// IF i + row_chunk / 4 >= tsrc.colsb / 4 -/// dst.dword[i] := 0 -/// ELSE -/// dst.word[2*i+0] := 0 -/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) -/// FI -/// ENDFOR -/// dst[MAX_VL-1:VL] := 0 -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction. -/// -/// \param tsrc -/// The source tile. Max size is 1024 Bytes. -/// \param row -/// The the row of the source tile. -#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row) - -/// Moves a row from a tile register to a zmm destination register, converting -/// the fp32 source elements to fp16. It places the resulting fp16 elements -/// in the low 16 bits within each dword. The row of the tile is selected -/// by a 32b GPR. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// __m512i _tile_cvtrowps2phl(__tile tsrc, unsigned int row); -/// \endcode -/// -/// \code{.operation} -/// VL := 512 -/// VL_bytes := VL >> 3 -/// row_index := row & 0xffff -/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes -/// FOR i := 0 TO (VL_bytes / 4) - 1 -/// IF i + row_chunk / 4 >= tsrc.colsb / 4 -/// dst.dword[i] := 0 -/// ELSE -/// dst.word[2*i+1] := 0 -/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) -/// FI -/// ENDFOR -/// dst[MAX_VL-1:VL] := 0 -/// zero_tileconfig_start() -/// \endcode -/// -/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction. -/// -/// \param tsrc -/// The source tile. Max size is 1024 Bytes. -/// \param row -/// The the row of the source tile. 
-#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row) - -/// Move one row of a tile data to a v16f32 data. -/// The row of the tile is selected by a 32b GPR. -/// -/// \headerfile <immintrin.h> -/// -/// \code -/// __m512 _tile_movrow(__tile a, unsigned b); -/// \endcode -/// -/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction. -/// -/// \param a -/// The 1st source tile. Max size is 1024 Bytes. -/// \param b -/// The 2nd source r32. Size is 4 Bytes. -/// \returns -/// The destination v16f32 data. Size is 64 Bytes. -/// -/// \code{.operation} -/// VL := 512 -/// VL_bytes := VL>>3 -/// row_index := b&0xffff -/// row_chunk := ((b>>16)&0xffff) * VL_bytes -/// FOR i := 0 TO (VL_bytes-1) -/// IF (row_chunk + i >= a.colsb) -/// dst.byte[i] := 0 -/// ELSE -/// dst.byte[i] := a.row[row_index].byte[row_chunk+i] -/// ENDFOR -/// \endcode -#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b) - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. 
- -static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal( - unsigned short m, unsigned short n, _tile1024i src, unsigned u) { - return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u); -} - -static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 -_tile_cvtrowps2pbf16h_internal(unsigned short m, unsigned short n, - _tile1024i src, unsigned u) { - return __builtin_ia32_tcvtrowps2pbf16h_internal(m, n, src, u); -} - -static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 -_tile_cvtrowps2pbf16l_internal(unsigned short m, unsigned short n, - _tile1024i src, unsigned u) { - return __builtin_ia32_tcvtrowps2pbf16l_internal(m, n, src, u); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal( - unsigned short m, unsigned short n, _tile1024i src, unsigned u) { - return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal( - unsigned short m, unsigned short n, _tile1024i src, unsigned u) { - return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal( - unsigned short m, unsigned short n, _tile1024i src, unsigned u) { - return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u); -} - -/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source -/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if -/// MXCSR.RC=RNE. Embedded rounding is not supported. -/// The row and chunk elements of tile is fetched from 32bit src1. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction. -/// -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source r32. Size is 4 Bytes. -/// \returns -/// The destination v16f32 data. Size is 64 Bytes. 
-__DEFAULT_FN_ATTRS_AVX512 -static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) { - return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1); -} - -/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source -/// elements to bf16 at high 16-bits of each dword. -/// The row and chunk elements of tile is fetched from 32bit src1. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16H </c> instruction. -/// -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source r32. Size is 4 Bytes. -/// \returns -/// The destination v32bf16 data. Size is 64 Bytes. -__DEFAULT_FN_ATTRS_AVX512 -static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) { - return _tile_cvtrowps2pbf16h_internal(src0.row, src0.col, src0.tile, src1); -} - -/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source -/// elements to bf16 at low 16-bits of each dword. -/// The row and chunk elements of tile is fetched from 32bit src1. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16L </c> instruction. -/// -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source r32. Size is 4 Bytes. -/// \returns -/// The destination v32bf16 data. Size is 64 Bytes. -__DEFAULT_FN_ATTRS_AVX512 -static __m512bh __tile_cvtrowps2pbf16l(__tile1024i src0, unsigned src1) { - return _tile_cvtrowps2pbf16l_internal(src0.row, src0.col, src0.tile, src1); -} - -/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source -/// elements to fp16 at high 16-bits of each dword. -/// The row and chunk elements of tile is fetched from 32bit src1. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction. -/// -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. 
-/// \param src1 -/// The 2nd source r32. Size is 4 Bytes. -/// \returns -/// The destination v32fp16 data. Size is 64 Bytes. -__DEFAULT_FN_ATTRS_AVX512 -static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) { - return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1); -} - -/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source -/// elements to fp16 at low 16-bits of each dword. -/// The row and chunk elements of tile is fetched from 32bit src1. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction. -/// -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source r32. Size is 4 Bytes. -/// \returns -/// The destination v32fp16 data. Size is 64 Bytes. -__DEFAULT_FN_ATTRS_AVX512 -static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) { - return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1); -} - -/// Move one row of a tile data to a v16f32 data. -/// The row of the tile is selected by a 32b GPR. -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction. -/// -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source r32. Size is 4 Bytes. -/// \returns -/// The destination v16i32 data. Size is 64 Bytes. 
-__DEFAULT_FN_ATTRS_AVX512 -static __m512i __tile_movrow(__tile1024i src0, unsigned src1) { - return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1); -} - -#endif // __x86_64__ -#endif // __AMX_AVX512INTRIN_H diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index bc240e28d59142..4bf7eac4195eec 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -656,10 +656,6 @@ _storebe_i64(void * __P, long long __D) { #include <amxtransposeintrin.h> #endif -#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__) -#include <amxavx512intrin.h> -#endif - #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include <avx512vp2intersectintrin.h> diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 1155a5edc73c34..ef878d16d445fd 100644 --- a/clan... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/115570 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits