The AArch64 FEAT_FP8DOT2 and FEAT_FP8DOT4 extensions introduce instructions for dot products of vectors.
This patch introduces the following intrinsics: 1. vdot{q}_{fp16|fp32}_mf8_fpm. 2. vdot{q}_lane{q}_{fp16|fp32}_mf8_fpm. It introduces two flags: fp8dot2 and fp8dot4. We had to add space for another type in aarch64_pragma_builtins_data struct. The macros were updated to reflect that. We added a new aarch64_builtin_signature variant, quaternary, and added support for it in the functions aarch64_fntype and aarch64_expand_pragma_builtin. We added a new namespace, function_checker, to implement range checks for functions defined using the new pragma approach. The old intrinsic range checks will continue to work. All the new AdvSIMD intrinsics we define that need lane checks should be using the function in this namespace to implement the checks. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (ENTRY): Change to handle extra type. (enum class): Added new variant. (struct aarch64_pragma_builtins_data): Add support for another type. (aarch64_get_number_of_args): Handle new signature. (require_integer_constant): New function to check whether the operand is an integer constant. (require_immediate_range): New function to validate index ranges. (check_simd_lane_bounds): New function to validate index operands. (aarch64_general_check_builtin_call): Call function_checker::check_simd_lane_bounds. (aarch64_expand_pragma_builtin): Handle new signature. * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): New flags. * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): New flags. * config/aarch64/aarch64-simd-pragma-builtins.def (ENTRY_BINARY): Change to handle extra type. (ENTRY_BINARY_FPM): Change to handle extra type. (ENTRY_UNARY_FPM): Change to handle extra type. (ENTRY_TERNARY_FPM_LANE): Macro to declare fpm ternary with lane intrinsics. (ENTRY_VDOT_FPM): Macro to declare vdot intrinsics. (REQUIRED_EXTENSIONS): Define to declare functions behind command line flags. 
* config/aarch64/aarch64-simd.md: (@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB:mode>): Instruction pattern for vdot2 intrinsics. (@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB2:mode><SI_ONLY:mode>): Instruction pattern for vdot2 intrinsics with lane. (@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB:mode>): Instruction pattern for vdot4 intrinsics. (@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB2:mode><SI_ONLY:mode>): Instruction pattern for vdot4 intrinsics with lane. * config/aarch64/aarch64.h (TARGET_FP8DOT2): New flag for fp8dot2 instructions. (TARGET_FP8DOT4): New flag for fp8dot4 instructions. * config/aarch64/iterators.md: New attributes and iterators. * doc/invoke.texi: New flag for fp8dot2 and fp8dot4 instructions. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/vdot2_fpmdot.c: New test. * gcc.target/aarch64/simd/vdot4_fpmdot.c: New test. --- gcc/config/aarch64/aarch64-builtins.cc | 107 +++++++++++++++++- gcc/config/aarch64/aarch64-c.cc | 4 + .../aarch64/aarch64-option-extensions.def | 4 + .../aarch64/aarch64-simd-pragma-builtins.def | 39 +++++-- gcc/config/aarch64/aarch64-simd.md | 58 ++++++++++ gcc/config/aarch64/aarch64.h | 6 + gcc/config/aarch64/iterators.md | 19 +++- gcc/doc/invoke.texi | 4 + .../gcc.target/aarch64/simd/vdot2_fpmdot.c | 77 +++++++++++++ .../gcc.target/aarch64/simd/vdot4_fpmdot.c | 77 +++++++++++++ 10 files changed, 380 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 9b7280a30d0..a71c8c9a64e 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -780,7 +780,7 @@ typedef struct AARCH64_SIMD_BUILTIN_##T##_##N##A, #undef ENTRY -#define ENTRY(N, S, M0, M1, M2, M3, USES_FPMR, U) \ +#define ENTRY(N, S, M0, M1, M2, M3, M4, USES_FPMR, U) \ AARCH64_##N, enum aarch64_builtins @@ -1590,9 +1590,10 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) enum class aarch64_builtin_signatures { + unary, binary, ternary, - unary, + quaternary, }; namespace { @@ -1617,6 +1618,7 @@ namespace simd_types { constexpr simd_type s16q { V8HImode, qualifier_none }; constexpr simd_type u16q { V8HImode, qualifier_unsigned }; + constexpr simd_type s32_index { SImode, qualifier_lane_index }; constexpr simd_type s32 { V2SImode, qualifier_none }; constexpr simd_type s32q { V4SImode, qualifier_none }; @@ -1642,10 +1644,10 @@ namespace simd_types { } #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, T3, USES_FPMR, U) \ +#define ENTRY(N, S, T0, T1, T2, T3, T4, USES_FPMR, U) \ {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \ - simd_types::T2, simd_types::T3, U, USES_FPMR, \ - aarch64_required_extensions::REQUIRED_EXTENSIONS}, + simd_types::T2, simd_types::T3, simd_types::T4, U, \ + USES_FPMR, aarch64_required_extensions::REQUIRED_EXTENSIONS}, /* Initialize pragma builtins. 
*/ @@ -1653,7 +1655,7 @@ struct aarch64_pragma_builtins_data { const char *name; aarch64_builtin_signatures signature; - simd_type types[4]; + simd_type types[5]; int unspec; bool uses_fpmr; aarch64_required_extensions required_extensions; @@ -1672,6 +1674,8 @@ aarch64_get_number_of_args (const aarch64_pragma_builtins_data &builtin_data) return 2; else if (builtin_data.signature == aarch64_builtin_signatures::ternary) return 3; + else if (builtin_data.signature == aarch64_builtin_signatures::quaternary) + return 4; else // No other signature supported. gcc_unreachable (); @@ -2504,6 +2508,72 @@ aarch64_general_required_extensions (unsigned int code) return ext::streaming_compatible (0); } +namespace function_checker { + +void +require_integer_constant (location_t location, tree arg) +{ + if (TREE_CODE (arg) != INTEGER_CST) + { + error_at (location, "Constant-type integer argument expected"); + return; + } +} + +void +require_immediate_range (location_t location, tree arg, HOST_WIDE_INT min, + HOST_WIDE_INT max) +{ + if (wi::to_widest (arg) < min || wi::to_widest (arg) > max) + { + error_at (location, "lane out of range %wd - %wd", min, max); + return; + } +} + +/* Validates indexing into a vector using the index's size and the instruction, + where instruction is represented by the unspec. + This only works for intrinsics declared using pragmas in + aarch64-simd-pragma-builtins.def. */ + +void +check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data + *builtin_data, tree *args) +{ + if (builtin_data == NULL) + // Don't check for functions that are not declared in + // aarch64-simd-pragma-builtins.def. 
+ return; + + auto nargs = aarch64_get_number_of_args (*builtin_data); + switch (builtin_data->unspec) + { + case UNSPEC_VDOT2: + case UNSPEC_VDOT4: + { + if (builtin_data->types[nargs].qualifiers != qualifier_lane_index) + break; + + auto index_arg = args[nargs - 1]; + require_integer_constant (location, index_arg); + + auto vector_to_index_mode = builtin_data->types[nargs - 1].mode; + int vector_to_index_mode_size + = GET_MODE_NUNITS (vector_to_index_mode).to_constant (); + + auto low = 0; + int high + = builtin_data->unspec == UNSPEC_VDOT2 + ? vector_to_index_mode_size / 2 - 1 + : vector_to_index_mode_size / 4 - 1; + require_immediate_range (location, index_arg, low, high); + break; + } + } +} + +}; + bool aarch64_general_check_builtin_call (location_t location, vec<location_t>, unsigned int code, tree fndecl, @@ -2515,6 +2585,9 @@ aarch64_general_check_builtin_call (location_t location, vec<location_t>, if (!aarch64_check_required_extensions (location, decl, required_extensions)) return false; + auto builtin_data = aarch64_get_pragma_builtin (code); + function_checker::check_simd_lane_bounds (location, builtin_data, args); + switch (code) { case AARCH64_RSR: @@ -3477,6 +3550,28 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, expand_insn (icode, nargs + 1, ops); break; + case UNSPEC_VDOT2: + case UNSPEC_VDOT4: + if (builtin_data->signature == aarch64_builtin_signatures::ternary) + icode = code_for_aarch64 (builtin_data->unspec, + builtin_data->types[0].mode, + builtin_data->types[1].mode, + builtin_data->types[2].mode, + builtin_data->types[3].mode); + else if + (builtin_data->signature == aarch64_builtin_signatures::quaternary) + icode = code_for_aarch64 (builtin_data->unspec, + builtin_data->types[0].mode, + builtin_data->types[1].mode, + builtin_data->types[2].mode, + builtin_data->types[3].mode, + builtin_data->types[4].mode); + else + gcc_unreachable (); + + expand_insn (icode, nargs + 1, ops); + break; + default: gcc_unreachable (); } diff --git 
a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index b13366b0621..ae1472e0fcf 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -260,6 +260,10 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile); + aarch64_def_or_undef (TARGET_FP8DOT2, "__ARM_FEATURE_FP8DOT2", pfile); + + aarch64_def_or_undef (TARGET_FP8DOT4, "__ARM_FEATURE_FP8DOT4", pfile); + aarch64_def_or_undef (TARGET_LS64, "__ARM_FEATURE_LS64", pfile); aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile); diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index c9d419afc8f..44d2e18d46b 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -236,6 +236,10 @@ AARCH64_OPT_EXTENSION("gcs", GCS, (), (), (), "gcs") AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8") +AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (SIMD), (), (), "fp8dot2") + +AARCH64_OPT_EXTENSION("fp8dot4", FP8DOT4, (SIMD), (), (), "fp8dot4") + AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax") #undef AARCH64_OPT_FMV_EXTENSION diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def index 91897cffcd8..4a94a6613f0 100644 --- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def @@ -20,20 +20,33 @@ #undef ENTRY_BINARY -#define ENTRY_BINARY(N, T0, T1, T2, U) \ - ENTRY (N, binary, T0, T1, T2, none, false, U) +#define ENTRY_BINARY(N, T0, T1, T2, U) \ + ENTRY (N, binary, T0, T1, T2, none, none, false, U) #undef ENTRY_BINARY_FPM -#define ENTRY_BINARY_FPM(N, T0, T1, T2, U) \ - ENTRY (N, binary, T0, T1, T2, none, true, U) +#define ENTRY_BINARY_FPM(N, T0, T1, T2, U) \ + ENTRY (N, binary, T0, T1, T2, none, none, true, U) #undef ENTRY_TERNARY_FPM -#define 
ENTRY_TERNARY_FPM(N, T0, T1, T2, T3, U) \ - ENTRY (N, ternary, T0, T1, T2, T3, true, U) +#define ENTRY_TERNARY_FPM(N, T0, T1, T2, T3, U) \ + ENTRY (N, ternary, T0, T1, T2, T3, none, true, U) + +#undef ENTRY_TERNARY_FPM_LANE +#define ENTRY_TERNARY_FPM_LANE(N, T0, T1, T2, T3, U) \ + ENTRY (N, quaternary, T0, T1, T2, T3, s32_index, true, U) #undef ENTRY_UNARY_FPM -#define ENTRY_UNARY_FPM(N, T0, T1, U) \ - ENTRY (N, unary, T0, T1, none, none, true, U) +#define ENTRY_UNARY_FPM(N, T0, T1, U) \ + ENTRY (N, unary, T0, T1, none, none, none, true, U) + +#undef ENTRY_VDOT_FPM +#define ENTRY_VDOT_FPM(T, U) \ + ENTRY_TERNARY_FPM (vdot_##T##_mf8_fpm, T, T, f8, f8, U) \ + ENTRY_TERNARY_FPM (vdotq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U) \ + ENTRY_TERNARY_FPM_LANE (vdot_lane_##T##_mf8_fpm, T, T, f8, f8, U) \ + ENTRY_TERNARY_FPM_LANE (vdot_laneq_##T##_mf8_fpm, T, T, f8, f8q, U) \ + ENTRY_TERNARY_FPM_LANE (vdotq_lane_##T##_mf8_fpm, T##q, T##q, f8q, f8, U) \ + ENTRY_TERNARY_FPM_LANE (vdotq_laneq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, U) #undef ENTRY_VHSDF #define ENTRY_VHSDF(NAME, UNSPEC) \ @@ -83,3 +96,13 @@ ENTRY_TERNARY_FPM (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q, UNSPEC_VCVT_HIGH) #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8) ENTRY_VHSDF_VHSDI (vscale, UNSPEC_FSCALE) #undef REQUIRED_EXTENSIONS + +// fpm dot2 product +#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT2) +ENTRY_VDOT_FPM (f16, UNSPEC_VDOT2) +#undef REQUIRED_EXTENSIONS + +// fpm dot4 product +#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT4) +ENTRY_VDOT_FPM (f32, UNSPEC_VDOT4) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index f8437469a7e..7b974865f55 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -10097,3 +10097,61 @@ "TARGET_FP8" "<fpm_uns_op>\t%0.<VHSDF:Vtype>, %1.<VHSDF:Vtype>, %2.<VHSDI:Vtype>" ) + +;; fpm vdot2 instructions. 
+(define_insn + "@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB:mode>" + [(set (match_operand:VHF 0 "register_operand" "=w") + (unspec:VHF + [(match_operand:VHF 1 "register_operand" "w") + (match_operand:VB 2 "register_operand" "w") + (match_operand:VB 3 "register_operand" "w") + (reg:DI FPM_REGNUM)] + FPM_VDOT2_UNS))] + "TARGET_FP8DOT2" + "<fpm_uns_op>\t%1.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>" +) + +;; fpm vdot2 instructions with lane. +(define_insn + "@aarch64_<fpm_uns_op><VHF:mode><VHF:mode><VB:mode><VB2:mode><SI_ONLY:mode>" + [(set (match_operand:VHF 0 "register_operand" "=w") + (unspec:VHF + [(match_operand:VHF 1 "register_operand" "w") + (match_operand:VB 2 "register_operand" "w") + (match_operand:VB2 3 "register_operand" "w") + (match_operand:SI_ONLY 4 "const_int_operand" "n") + (reg:DI FPM_REGNUM)] + FPM_VDOT2_UNS))] + "TARGET_FP8DOT2" + "<fpm_uns_op>\t%1.<VHF:Vtype>, %2.<VB:Vtype>, %3.<VHF:Vdotlanetype>[%4]" +) + +;; fpm vdot4 instructions. +(define_insn + "@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB:mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (unspec:VDQSF + [(match_operand:VDQSF 1 "register_operand" "w") + (match_operand:VB 2 "register_operand" "w") + (match_operand:VB 3 "register_operand" "w") + (reg:DI FPM_REGNUM)] + FPM_VDOT4_UNS))] + "TARGET_FP8DOT4" + "<fpm_uns_op>\t%1.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VB:Vtype>" +) + +;; fpm vdot4 instructions with lane. 
+(define_insn + "@aarch64_<fpm_uns_op><VDQSF:mode><VDQSF:mode><VB:mode><VB2:mode><SI_ONLY:mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (unspec:VDQSF + [(match_operand:VDQSF 1 "register_operand" "w") + (match_operand:VB 2 "register_operand" "w") + (match_operand:VB2 3 "register_operand" "w") + (match_operand:SI_ONLY 4 "const_int_operand" "n") + (reg:DI FPM_REGNUM)] + FPM_VDOT4_UNS))] + "TARGET_FP8DOT4" + "<fpm_uns_op>\t%1.<VDQSF:Vtype>, %2.<VB:Vtype>, %3.<VDQSF:Vdotlanetype>[%4]" +) diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index f07b2c49f0d..c50a578731a 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -494,6 +494,12 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED ((TARGET_SVE2p1 || TARGET_STREAMING) \ && (TARGET_SME2 || TARGET_NON_STREAMING)) +/* fp8 dot product instructions are enabled through +fp8dot2. */ +#define TARGET_FP8DOT2 AARCH64_HAVE_ISA (FP8DOT2) + +/* fp8 dot product instructions are enabled through +fp8dot4. */ +#define TARGET_FP8DOT4 AARCH64_HAVE_ISA (FP8DOT4) + /* Standard register usage. */ /* 31 64-bit general purpose registers R0-R30: diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index bdd276b554b..8c03dcd14dd 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -163,6 +163,10 @@ ;; Advanced SIMD Float modes. (define_mode_iterator VDQF [V2SF V4SF V2DF]) + +(define_mode_iterator VHF [(V4HF "TARGET_SIMD_F16INST") + (V8HF "TARGET_SIMD_F16INST")]) + (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST") (V8HF "TARGET_SIMD_F16INST") V2SF V4SF V2DF]) @@ -321,6 +325,7 @@ ;; All byte modes. (define_mode_iterator VB [V8QI V16QI]) +(define_mode_iterator VB2 [VB]) ;; 1 and 2 lane DI and DF modes. (define_mode_iterator V12DIF [V1DI V1DF V2DI V2DF]) @@ -764,6 +769,8 @@ UNSPEC_VCVT2 ; Used in aarch64-simd.md. UNSPEC_VCVT2_HIGH ; Used in aarch64-simd.md. UNSPEC_VCVT2_LOW ; Used in aarch64-simd.md. 
+ UNSPEC_VDOT2 ; Used in aarch64-simd.md. + UNSPEC_VDOT4 ; Used in aarch64-simd.md. UNSPEC_TBL ; Used in vector permute patterns. UNSPEC_TBLQ ; Used in vector permute patterns. UNSPEC_TBX ; Used in vector permute patterns. @@ -2491,6 +2498,11 @@ (VNx8HF ".h") (VNx16HF "") (VNx32HF "") (VNx8HI ".h") (VNx16HI "") (VNx32HI "")]) + +;; Lane index suffix for fp8 vdot operations depends on the output mode +(define_mode_attr Vdotlanetype [(V4HF "2b") (V8HF "2b") + (V2SF "4b") (V4SF "4b")]) + ;; The number of bytes controlled by a predicate (define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2") (VNx4BI "4") (VNx2BI "8")]) @@ -4720,7 +4732,12 @@ (UNSPEC_VCVT2_HIGH "f2cvtl2") (UNSPEC_VCVT2_LOW "f2cvtl")]) +(define_int_iterator FPM_VDOT2_UNS [UNSPEC_VDOT2]) +(define_int_iterator FPM_VDOT4_UNS [UNSPEC_VDOT4]) + (define_int_attr fpm_uns_op [(UNSPEC_FSCALE "fscale") (UNSPEC_VCVT "fcvtn") - (UNSPEC_VCVT_HIGH "fcvtn2")]) + (UNSPEC_VCVT_HIGH "fcvtn2") + (UNSPEC_VDOT2 "fdot") + (UNSPEC_VDOT4 "fdot")]) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 4a494f6a668..bc3f7423425 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -21807,6 +21807,10 @@ Enable support for Armv8.9-a/9.4-a translation hardening extension. Enable the RCpc3 (Release Consistency) extension. @item fp8 Enable the fp8 (8-bit floating point) extension. +@item fp8dot2 +Enable the fp8dot2 (8-bit floating point dot product) extension. +@item fp8dot4 +Enable the fp8dot4 (8-bit floating point dot product) extension. @item faminmax Enable the Floating Point Absolute Maximum/Minimum extension. 
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c new file mode 100644 index 00000000000..3e888a67ec7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpmdot.c @@ -0,0 +1,77 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +** test_vdot_f16_fpm: +** msr fpmr, x0 +** fdot v0.4h, v1.8b, v2.8b +** ret +*/ +float16x4_t +test_vdot_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d) +{ + return vdot_f16_mf8_fpm (a, b, c, d); +} + +/* +** test_vdotq_f16_fpm: +** msr fpmr, x0 +** fdot v0.8h, v1.16b, v2.16b +** ret +*/ +float16x8_t +test_vdotq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d) +{ + return vdotq_f16_mf8_fpm (a, b, c, d); +} + +/* +** test_vdot_lane_f16_fpm: +** msr fpmr, x0 +** fdot v0.4h, v1.8b, v2.2b\[1\] +** ret +*/ +float16x4_t +test_vdot_lane_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d) +{ + return vdot_lane_f16_mf8_fpm (a, b, c, 1, d); +} + +/* +** test_vdot_laneq_f16_fpm: +** msr fpmr, x0 +** fdot v0.4h, v1.8b, v2.2b\[1\] +** ret +*/ +float16x4_t +test_vdot_laneq_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d) +{ + return vdot_laneq_f16_mf8_fpm (a, b, c, 1, d); +} + +/* +** test_vdotq_lane_f16_fpm: +** msr fpmr, x0 +** fdot v0.8h, v1.16b, v2.2b\[1\] +** ret +*/ +float16x8_t +test_vdotq_lane_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d) +{ + return vdotq_lane_f16_mf8_fpm (a, b, c, 1, d); +} + +/* +** test_vdotq_laneq_f16_fpm: +** msr fpmr, x0 +** fdot v0.8h, v1.16b, v2.2b\[1\] +** ret +*/ +float16x8_t +test_vdotq_laneq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d) +{ + return vdotq_laneq_f16_mf8_fpm (a, b, c, 1, d); +} diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c new file 
mode 100644 index 00000000000..f03dd0a0d36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpmdot.c @@ -0,0 +1,77 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +** test_vdot_f32_fpm: +** msr fpmr, x0 +** fdot v0.2s, v1.8b, v2.8b +** ret +*/ +float32x2_t +test_vdot_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d) +{ + return vdot_f32_mf8_fpm (a, b, c, d); +} + +/* +** test_vdotq_f32_fpm: +** msr fpmr, x0 +** fdot v0.4s, v1.16b, v2.16b +** ret +*/ +float32x4_t +test_vdotq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d) +{ + return vdotq_f32_mf8_fpm (a, b, c, d); +} + +/* +** test_vdot_lane_f32_fpm: +** msr fpmr, x0 +** fdot v0.2s, v1.8b, v2.4b\[1\] +** ret +*/ +float32x2_t +test_vdot_lane_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d) +{ + return vdot_lane_f32_mf8_fpm (a, b, c, 1, d); +} + +/* +** test_vdot_laneq_f32_fpm: +** msr fpmr, x0 +** fdot v0.2s, v1.8b, v2.4b\[1\] +** ret +*/ +float32x2_t +test_vdot_laneq_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t d) +{ + return vdot_laneq_f32_mf8_fpm (a, b, c, 1, d); +} + +/* +** test_vdotq_lane_f32_fpm: +** msr fpmr, x0 +** fdot v0.4s, v1.16b, v2.4b\[1\] +** ret +*/ +float32x4_t +test_vdotq_lane_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t d) +{ + return vdotq_lane_f32_mf8_fpm (a, b, c, 1, d); +} + +/* +** test_vdotq_laneq_f32_fpm: +** msr fpmr, x0 +** fdot v0.4s, v1.16b, v2.4b\[1\] +** ret +*/ +float32x4_t +test_vdotq_laneq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d) +{ + return vdotq_laneq_f32_mf8_fpm (a, b, c, 1, d); +}