================
@@ -0,0 +1,94 @@
+/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_BF16TRANSPOSEINTRIN_H
+#define __AMX_BF16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-bf16,amx-transpose")))
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+/// tiles \a a and \a b, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in \a dst, and store the
+/// 32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///   tmp := dst.row[m]
+///   FOR k := 0 TO (a.colsb / 4) - 1
+///     FOR n := 0 TO (dst.colsb / 4) - 1
+///       tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
+///                      FP32(b.row[k].bf16[2*n+0])
+///       tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
+///                      FP32(b.row[k].bf16[2*n+1])
+///     ENDFOR
+///   ENDFOR
+///   write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)
----------------
fzou1 wrote:
Please parenthesize the macro arguments in the expansion:
__builtin_ia32_ttdpbf16ps(dst, a, b) -> __builtin_ia32_ttdpbf16ps((dst), (a), (b))

https://github.com/llvm/llvm-project/pull/115660
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits