Re: [PATCH v4] AArch64: Add LUTI ACLE for SVE2

Saurabh Jha Fri, 17 Jan 2025 04:20:22 -0800



On 1/16/2025 8:44 AM, Richard Sandiford wrote:

Thanks for the update.  Mostly LGTM, but some comments below:

<saurabh....@arm.com> writes:

diff --git a/gcc/config/aarch64/aarch64-sve2.md 
b/gcc/config/aarch64/aarch64-sve2.md
index f8cfe08f4c0..0a1dc314f94 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -133,6 +133,7 @@
  ;; ---- Optional AES extensions
  ;; ---- Optional SHA-3 extensions
  ;; ---- Optional SM4 extensions
+;; ---- Table lookup


This puts it under:

;; == Cryptographic extensions

but it's not a crytographic extension.  Probably better to put it under:

;; == General

instead.

  ;; =========================================================================
  ;; == Moves
@@ -4211,3 +4212,35 @@
    "sm4ekey\t%0.s, %1.s, %2.s"
    [(set_attr "type" "crypto_sm4")]
  )
+
+;; -------------------------------------------------------------------------
+;; ---- Table lookup
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - LUTI2
+;; - LUTI4
+;; -------------------------------------------------------------------------
+
+(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
+  [(set (match_operand:SVE_FULL_BH 0 "register_operand" "=w")
+       (unspec:SVE_FULL_BH
+        [(match_operand:SVE_FULL_BH 1 "register_operand" "w")
+         (match_operand:VNx16QI 2 "register_operand" "w")


This is correct

+         (match_operand:DI 3 "const_int_operand")
+         (const_int LUTI_BITS)]
+        UNSPEC_SVE_LUTI))]
+  "TARGET_LUT && TARGET_SVE2_OR_SME2"
+  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
+)
+
+(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
+  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
+       (unspec:<VSINGLE>
+        [(match_operand:SVE_FULL_Hx2 1 "register_operand" "Uw2")


...but this should use aligned_register_operand instead of
register_operand.

Are you sure we want it to be aligned? That requirement is not there onthe { <Zn1>.H, <Zn2>.H } operand here:https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en,as in, nothing like it has to be "Zn times 2...."


Agree with the rest of the review.

+         (match_operand:VNx16QI 2 "register_operand" "w")
+         (match_operand:DI 3 "const_int_operand")
+         (const_int LUTI_BITS)]
+         UNSPEC_SVE_LUTI))]
+  "TARGET_LUT && TARGET_SVE2_OR_SME2"
+  "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index ff0f34dd043..0fbf96f1ab9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -553,6 +553,18 @@
  (define_mode_iterator SVE_FULL_BHS [VNx16QI VNx8HI VNx4SI
                                    VNx8BF VNx8HF VNx4SF])

+;; Fully-packed SVE vector byte modes that have 32-bit or smaller elements.

+(define_mode_iterator SVE_FULL_BS [VNx16QI VNx4SI VNx4SF])


This is no longer needed.

+
+;; Fully-packed SVE vector byte modes that have 16-bit or smaller elements.
+(define_mode_iterator SVE_FULL_BH [VNx16QI VNx8HI VNx8HF VNx8BF])
+
+;; Fully-packed half word SVE vector modes
+(define_mode_iterator SVE_FULL_H [VNx8HI VNx8HF VNx8BF])


Similarly, SVE_FULL_H is no longer needed.

+
+;; Pairs of fully-packed SVE vector modes (half word only)
+(define_mode_iterator SVE_FULL_Hx2 [VNx16HI VNx16HF VNx16BF])
+
  ;; Fully-packed SVE vector modes that have 32-bit elements.
  (define_mode_iterator SVE_FULL_S [VNx4SI VNx4SF])

@@ -1186,6 +1198,7 @@

      UNSPEC_UZPQ2
      UNSPEC_ZIPQ1
      UNSPEC_ZIPQ2
+    UNSPEC_SVE_LUTI

;; All used in aarch64-sme.md

      UNSPEC_SME_ADD
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
index d3ae707ac49..c0dd89fa924 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
@@ -780,4 +780,20 @@
                    "w" (z16), "w" (z22), "w" (z29));             \
    }

+#define TEST_1X2_NARROW(NAME, RTYPE, TTYPE, ZTYPE, CODE1, CODE2) \

+  PROTO(NAME, void, ())                                                        
\
+  {                                                                    \
+    register RTYPE z0 __asm ("z0");                                  \
+    register ZTYPE z5 __asm ("z5");                                  \
+    register TTYPE z6 __asm ("z6");                                  \
+    register RTYPE z16 __asm ("z16");                                        \
+    register ZTYPE z22 __asm ("z22");                                        \
+    register TTYPE z29 __asm ("z29");                                        \
+    register RTYPE z0_res __asm ("z0");                                      \
+    __asm volatile ("" : "=w" (z0), "=w" (z5), "=w" (z6),              \
+                   "=w" (z16), "=w" (z22), "=w" (z29));          \
+    INVOKE (CODE1, CODE2);                                             \
+    __asm volatile ("" :: "w" (z0_res), "w" (z5), "w" (z22));          \
+  }
+
  #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
new file mode 100644
index 00000000000..142de490267
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/lut_1.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+#pragma GCC target ("arch=armv9.2-a+sve2+lut")
+
+void
+test (svfloat16_t f16, svfloat32_t f32, svfloat64_t f64,
+      svfloat16x2_t f16x2, svfloat32x2_t f32x2, svfloat64x2_t f64x2,
+      svuint8_t u8, svuint16_t u16, svuint32_t u32, svuint64_t u64,
+      svuint8x2_t u8x2, svuint16x2_t u16x2,
+      svuint32x2_t u32x2, svuint64x2_t u64x2,
+      svint8_t s8, svint16_t s16, svint32_t s32, svint64_t s64,
+      svint8x2_t s8x2, svint16x2_t s16x2, svint32x2_t s32x2, svint64x2_t s64x2,
+      svbfloat16_t bf16, svbfloat16x2_t bf16x2)
+{


It would be good to have a test here for "too few" and "too many" arguments.

It would also be good to test cases in which non-vector arguments are passed,
such as:

   svluti2_lane (0, u8, 0);

The tests below concentrate on cases where the first argument has an
invalid type, but they don't cover any cases where the second and
third arguments have invalid types, or where the third argument is
nonconstant.  How about adding:

   svluti2_lane (f16, 0, 0);
   svluti2_lane (u16, u16, 0);
   svluti2_lane (f16, u8, u8);
   svluti2_lane (f16, u8, x);

where "x" is a new parameter of type "int".

+  svluti2_lane (f16, u8, 0);
+  svluti2_lane (bf16, u8, 0);
+
+  svluti2_lane (f32, u8, 0); /* { dg-error {'svluti2_lane' has no form that 
takes 'svfloat32_t' arguments} } */
+  svluti2_lane (f64, u8, 0); /* { dg-error {'svluti2_lane' has no form that 
takes 'svfloat64_t' arguments} } */
+
+  svluti2_lane (u8, u8, 0);
+  svluti2_lane (u16, u8, 0);
+
+  svluti2_lane (u32, u8, 0); /* { dg-error {'svluti2_lane' has no form that 
takes 'svuint32_t' arguments} } */
+  svluti2_lane (u64, u8, 0); /* { dg-error {'svluti2_lane' has no form that 
takes 'svuint64_t' arguments} } */
+
+  svluti2_lane (s8, u8, 0);
+  svluti2_lane (s16, u8, 0);
+
+  svluti2_lane (s32, u8, 0); /* { dg-error {'svluti2_lane' has no form that 
takes 'svint32_t' arguments} } */
+  svluti2_lane (s64, u8, 0); /* { dg-error {'svluti2_lane' has no form that 
takes 'svint64_t' arguments} } */
+
+  svluti4_lane (f16, u8, 0);
+  svluti4_lane (bf16, u8, 0);
+  svluti4_lane_x2 (f16x2, u8, 0);
+  svluti4_lane_x2 (bf16x2, u8, 0);


The _x2 shouldn't be present in the overloaded name.  The fix for that
is to add:

   bool explicit_group_suffix_p () const override { return false; }

to the shape class (canonically as the first thing in the class).

It would be good to test something like f16x3 as well, for the case
in which the element size is ok but the tuple size is wrong.

+
+  svluti4_lane (f32, u8, 0); /* { dg-error {'svluti4_lane' has no form that 
takes 'svfloat32_t' arguments} } */
+  svluti4_lane (f64, u8, 0); /* { dg-error {'svluti4_lane' has no form that 
takes 'svfloat64_t' arguments} } */
+  svluti4_lane_x2 (f32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form 
that takes 'svfloat32x2_t' arguments} } */
+  svluti4_lane_x2 (f64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form 
that takes 'svfloat64x2_t' arguments} } */
+
+  svluti4_lane (u8, u8, 0);
+  svluti4_lane (u16, u8, 0);
+  svluti4_lane_x2 (u8x2, u8, 0);
+  svluti4_lane_x2 (u16x2, u8, 0);
+
+  svluti4_lane (u32, u8, 0); /* { dg-error {'svluti4_lane' has no form that 
takes 'svuint32_t' arguments} } */
+  svluti4_lane (u64, u8, 0); /* { dg-error {'svluti4_lane' has no form that 
takes 'svuint64_t' arguments} } */
+  svluti4_lane_x2 (u32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form 
that takes 'svuint32x2_t' arguments} } */
+  svluti4_lane_x2 (u64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form 
that takes 'svuint64x2_t' arguments} } */
+
+  svluti4_lane (s8, u8, 0);
+  svluti4_lane (s16, u8, 0);
+  svluti4_lane_x2 (s8x2, u8, 0);
+  svluti4_lane_x2 (s16x2, u8, 0);
+
+  svluti4_lane (s32, u8, 0); /* { dg-error {'svluti4_lane' has no form that 
takes 'svint32_t' arguments} } */
+  svluti4_lane (s64, u8, 0); /* { dg-error {'svluti4_lane' has no form that 
takes 'svint64_t' arguments} } */
+  svluti4_lane_x2 (s32x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form 
that takes 'svint32x2_t' arguments} } */
+  svluti4_lane_x2 (s64x2, u8, 0); /* { dg-error {'svluti4_lane_x2' has no form 
that takes 'svint64x2_t' arguments} } */
+}
[...]
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
new file mode 100644
index 00000000000..1f3f8aab5ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti4_bf16_x2.c
@@ -0,0 +1,30 @@
+/* { dg-do assemble { target aarch64_asm_lut_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve2+lut"
+#if STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** luti4_min_idx_test:
+**     luti4   z0\.h, \{z6\.h \- z7\.h\}, z5\[0\]
+**     ret
+*/
+
+TEST_1X2_NARROW(luti4_min_idx_test, svbfloat16_t, svbfloat16x2_t, svuint8_t,
+               z0_res = svluti4_lane_bf16_x2 (z6, z5, 0),
+               z0_res = svluti4_lane_x2 (z6, z5, 0))
+
+/*
+** luti4_max_idx_test:
+**     luti4   z0\.h, \{z6\.h \- z7\.h\}, z5\[3\]
+**     ret
+*/
+
+TEST_1X2_NARROW(luti4_max_idx_test, svbfloat16_t, svbfloat16x2_t, svuint8_t,
+               z0_res = svluti4_lane_bf16_x2 (z6, z5, 3),
+               z0_res = svluti4_lane_x2 (z6, z5, 3))


The macro has a few variations:

+    register RTYPE z0 __asm ("z0");                                  \
+    register ZTYPE z5 __asm ("z5");                                  \
+    register TTYPE z6 __asm ("z6");                                  \
+    register RTYPE z16 __asm ("z16");                                        \
+    register ZTYPE z22 __asm ("z22");                                        \
+    register TTYPE z29 __asm ("z29");                                        \
+    register RTYPE z0_res __asm ("z0");                                      \

which is good. :)  I think we should try more of them here.  In particular,
I think we should try z29, for the case in which the register is naturally
unaligned.  I think that might have caught the predicate issue mentioned
above.

Richard

Re: [PATCH v4] AArch64: Add LUTI ACLE for SVE2

Reply via email to