llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: Phoebe Wang (phoebewang) <details> <summary>Changes</summary> Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 --- Patch is 68.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115660.diff 22 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86_64.def (+12) - (modified) clang/lib/Headers/CMakeLists.txt (+3) - (added) clang/lib/Headers/amxbf16transposeintrin.h (+94) - (added) clang/lib/Headers/amxcomplextransposeintrin.h (+301) - (modified) clang/lib/Headers/amxfp16intrin.h (+35) - (added) clang/lib/Headers/amxfp16transposeintrin.h (+94) - (modified) clang/lib/Headers/amxintrin.h (-32) - (modified) clang/lib/Headers/immintrin.h (+19-3) - (modified) clang/lib/Sema/SemaX86.cpp (+6) - (modified) clang/test/CodeGen/X86/amx_transpose.c (+39) - (modified) clang/test/CodeGen/X86/amx_transpose_api.c (+49-1) - (modified) clang/test/CodeGen/X86/amx_transpose_errors.c (+47-3) - (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+57) - (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+25-3) - (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+34-24) - (modified) llvm/lib/Target/X86/X86InstrAMX.td (+89) - (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+23-1) - (modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+7-1) - (modified) llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll (+76-1) - (modified) llvm/test/MC/Disassembler/X86/amx-transpose-att.txt (+48) - (modified) llvm/test/MC/X86/amx-transpose-att.s (+48) - (modified) llvm/test/MC/X86/amx-transpose-intel.s (+48) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 9f7462b1e0d962..cc8637ed9c50da 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose") TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512") TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512") @@ -164,6 +170,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose") TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose") TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose") +TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose") TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512") TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512") diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 76366ca1f108e9..19013d37f46ef7 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -147,8 +147,11 @@ set(x86_files adxintrin.h ammintrin.h amxavx512intrin.h + amxbf16transposeintrin.h amxcomplexintrin.h + amxcomplextransposeintrin.h amxfp16intrin.h + amxfp16transposeintrin.h amxfp8intrin.h amxintrin.h amxtransposeintrin.h diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h new file mode 100644 index 00000000000000..7d31384e317988 --- /dev/null +++ b/clang/lib/Headers/amxbf16transposeintrin.h @@ -0,0 +1,94 @@ +/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_BF16TRANSPOSEINTRIN_H +#define __AMX_BF16TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-bf16,amx-transpose"))) + +/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in +/// tiles \a a and \a b, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in \a dst, and store the +/// 32-bit result back to tile \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO (a.colsb / 4) - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) * +/// FP32(b.row[k].bf16[2*n+0]) +/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) * +/// FP32(b.row[k].bf16[2*n+1]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TTDPBF16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b) + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS +_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2); +} + +/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in +/// tiles src0 and src1, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in "dst", and store the +/// 32-bit result back to tile "dst". +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __x86_64__ */ +#endif /* __AMX_BF16TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h new file mode 100644 index 00000000000000..06fb53e4deadcd --- /dev/null +++ b/clang/lib/Headers/amxcomplextransposeintrin.h @@ -0,0 +1,301 @@ +/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H +#define __AMX_COMPLEXTRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-complex,amx-transpose"))) + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles \a a and \a b is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// Calculates the imaginary part of the result. For each possible combination +/// of (transposed column of \a a, column of \a b), it performs a set of +/// multiplication and accumulations on all corresponding complex numbers +/// (one from \a a and one from \a b). The imaginary part of the \a a element +/// is multiplied with the real part of the corresponding \a b element, and +/// the real part of the \a a element is multiplied with the imaginary part +/// of the corresponding \a b elements. The two accumulated results are +/// added, and then accumulated into the corresponding row and column of +/// \a dst. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO a.rows - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b) + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles \a a and \a b is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// Calculates the real part of the result. For each possible combination +/// of (rtransposed colum of \a a, column of \a b), it performs a set of +/// multiplication and accumulations on all corresponding complex numbers +/// (one from \a a and one from \a b). The real part of the \a a element is +/// multiplied with the real part of the corresponding \a b element, and the +/// negated imaginary part of the \a a element is multiplied with the +/// imaginary part of the corresponding \a b elements. The two accumulated +/// results are added, and then accumulated into the corresponding row and +/// column of \a dst. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO a.rows - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) +/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b) + +/// Perform matrix conjugate transpose and multiplication of two tiles +/// containing complex elements and accumulate the results into a packed +/// single precision tile. Each dword element in input tiles \a a and \a b +/// is interpreted as a complex number with FP16 real part and FP16 imaginary +/// part. +/// Calculates the imaginary part of the result. For each possible combination +/// of (transposed column of \a a, column of \a b), it performs a set of +/// multiplication and accumulations on all corresponding complex numbers +/// (one from \a a and one from \a b). The negated imaginary part of the \a a +/// element is multiplied with the real part of the corresponding \a b +/// element, and the real part of the \a a element is multiplied with the +/// imaginary part of the corresponding \a b elements. The two accumulated +/// results are added, and then accumulated into the corresponding row and +/// column of \a dst. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO a.rows - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) +/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_conjtcmmimfp16ps(dst, a, b) \ + __builtin_ia32_tconjtcmmimfp16ps(dst, a, b) + +/// Perform conjugate transpose of an FP16-pair of complex elements from \a a +/// and writes the result to \a dst. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// void _tile_conjtfp16(__tile dst, __tile a); +/// \endcode +/// +/// \code{.operation} +/// FOR i := 0 TO dst.rows - 1 +/// FOR j := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0] +/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1] +/// ENDFOR +/// write_row_and_zero(dst, i, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCONJTFP16 instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The source tile. Max size is 1024 Bytes. +#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a) + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal( + unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, + _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal( + unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, + _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal( + unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, + _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtfp16_internal( + unsigned short m, unsigned short n, _tile1024i src) { + return __builtin_ia32_tconjtfp16_internal(m, n, src); +} + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles src0 and src1 is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// This function calculates the imaginary part of the result. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles src0 and src1 is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// This function calculates the real part of the result. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +/// Perform matrix conjugate transpose and multiplication of two tiles +/// containing complex elements and accumulate the results into a packed +/// single precision tile. Each dword element in input tiles src0 and src1 +/// is interpreted as a complex number with FP16 real part and FP16 imaginary +/// part. +/// This function calculates the imaginary part of the result. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +/// Perform conjugate transpose of an FP16-pair of complex elements from src and +/// writes the result to dst. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) { + dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile); +} + +#undef __DEFAULT_FN_ATTRS + +#endif // __x86_64__ +#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H diff --git a/clang/lib/Headers/amxfp16intrin.h b/clang/lib/Headers/amxfp16intrin.h index ed798245d41efb..bb4bc31fdafd50 100644 --- a/clang/lib/Headers/amxfp16intrin.h +++ b/clang/lib/Headers/amxfp16intrin.h @@ -15,6 +15,10 @@ #define __AMX_FP16INTRIN_H #ifdef __x86_64__ +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16"))) + /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a /// and \a b, accumulating the intermediate single-precision (32-bit) /// floating-point elements with elements in \a dst, and store the 32-bit @@ -54,5 +58,36 @@ #define _tile_dpfp16ps(dst, a, b) \ __builtin_ia32_tdpfp16ps(dst, a, b) +/// This is internal intrinsic. C/C++ user should avoid calli... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/115660 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits