On Wed, Jan 08, 2025 at 11:13:41AM +0000, Richard Sandiford wrote:
> <saurabh....@arm.com> writes:
> > This patch introduces support for the LUTI2/LUTI4 ACLE intrinsics for SVE2.
> >
> > LUTI instructions are used for efficient table lookups with 2-bit
> > or 4-bit indices. LUTI2 reads indexed 8-bit or 16-bit elements from
> > the low 128 bits of the table vector using packed 2-bit indices,
> > while LUTI4 can read from the low 128 or 256 bits of the table
> > vector or from two table vectors using packed 4-bit indices.
> > These instructions fill the destination vector with table elements
> > selected by the indices in one segment of the index vector, where
> > the segment is chosen by the immediate vector segment index.
> >
> > The changes include a new AArch64 option extension "lut", the
> > __ARM_FEATURE_LUT preprocessor macro, definitions for the new LUTI
> > instruction shapes, and implementations of the svluti2 and svluti4
> > builtins.
> >
> > New tests are added as well.
> >
> > ---
> >
> > Hey,
> >
> > This is a respin of
> > https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658015.html, rebased
> > on master.  Regression-tested on aarch64-unknown-linux-gnu with no
> > regressions.
> 
> Thanks for picking this up!  The issues described below are of course not
> your fault :)

I've spotted a couple more issues, noted below.
 
> > Ok for master?
> >
> > Thanks,
> > Saurabh
> > ---
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > index ca721dd2c09..0f6d366b2d6 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > @@ -903,6 +903,47 @@ struct load_ext_gather_base : public overloaded_base<1>
> >    }
> >  };
> >  
> > +
> > +/* sv<v0>_t svlut_<t0>(sv<t0>_t, svuint8_t, uint64_t)
> 
> Because of the potential tuple argument, I suppose this should be:
> 
>   sv<v0>_t svlut[_<t0>_g](sv<t0>x<g>_t, svuint8_t, uint64_t)
> 
> Unlike for the ZT version of svluti, the type suffix is optional.
> To quote from the ACLE spec:
> 
>   svint8_t svluti2_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
>   svint16_t svluti2_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx);
>   svint8_t svluti4_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
>   svint16_t svluti4_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx);
>   svint16_t svluti4_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx);
> 
> Because of that:
> 
> > +   where the final argument is a constant index, the instruction divides
> > +   the vector argument into BITS-bit quantities.  */
> > +template<unsigned int BITS>
> > +struct luti_base : public nonoverloaded_base
> 
> ...this should be an overloaded function and have a resolver.
> 
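FWIW, a resolver for this shape might look something like the
following (an untested sketch modelled on neighbouring shapes in
aarch64-sve-builtins-shapes.cc; the exact function_resolver calls
may need adjusting):

  tree
  resolve (function_resolver &r) const override
  {
    sve_type type;
    /* Expect (table, indices, imm_idx); inferring the table's SVE type
       lets both the single-vector and _x2 tuple forms resolve.  */
    if (!r.check_num_arguments (3)
        || !(type = r.infer_sve_type (0))
        || !r.require_vector_type (1, VECTOR_TYPE_svuint8_t)
        || !r.require_integer_immediate (2))
      return error_mark_node;
    return r.resolve_to (r.mode_suffix_id, type);
  }
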
> > +{
> > +  void
> > +  build (function_builder &b, const function_group_info &group) const override
> > +  {
> > +    /* Format: return type, table vector, indices vector, immediate value.  */
> > +    build_all (b, "v0,t0,vu8,su64", group, MODE_none);
> > +  }
> > +
> > +  bool
> > +  check (function_checker &c) const override
> > +  {
> > +    int max_range;
> > +    bool byte_mode = c.type_suffix (0).element_bits == 8;
> > +
> > +    if (BITS == 2)
> > +      max_range = byte_mode ? 3 : 7;
> > +    else if (BITS == 4)
> > +      max_range = byte_mode ? 1 : 7;
> 
> It looks like this should be ? 1 : 3, see:
> 
>   https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en
> 
> Or, programmatically, I think this is:
> 
>   auto max_range = c.type_suffix (0).element_bits / BITS - 1;
> 
> for all cases.
> 
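FWIW, with that change the whole check function would collapse to
something like this (untested sketch):

  bool
  check (function_checker &c) const override
  {
    /* The immediate selects one group of BITS-bit indices from the
       index vector; there are element_bits / BITS such groups.  */
    auto max_range = c.type_suffix (0).element_bits / BITS - 1;
    return c.require_immediate_range (2, 0, max_range);
  }
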
> > +    else
> > +      /* Unsupported number of indices bits for LUTI.  */
> > +      gcc_unreachable ();
> > +
> > +    return c.require_immediate_range (2, 0, max_range);
> > +  }
> > +
> > +};
> > +
> > +/* Specializations for 2-bit and 4-bit indices.  */
> > +using luti2_def = luti_base<2>;
> > +SHAPE (luti2)
> > +
> > +using luti4_def = luti_base<4>;
> > +SHAPE (luti4)
> > +
> > +
> >  /* sv<t0>x<g>_t svfoo_t0_g(uint64_t, svuint8_t, uint64_t)
> >  
> >     where the first argument is the ZT register number (currently always 0)
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > index e726fa1fb68..0c4f8251ac0 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > @@ -164,6 +164,10 @@ DEF_SVE_FUNCTION (svwhilegt, compare_scalar, while, none)
> >  DEF_SVE_FUNCTION (svwhilerw, compare_ptr, all_data, none)
> >  DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none)
> >  DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none)
> > +DEF_SVE_FUNCTION (svluti2_lane, luti2, bhs_data, none)
> > +DEF_SVE_FUNCTION (svluti4_lane, luti4, bhs_data, none)
> > +DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bhs_data, x2, none)
> 
> bhs_data looks wrong: there should be no .s versions.  Similarly...

This also needs gating; I think the correct fix is to prefix the new
intrinsics with:

#undef REQUIRED_EXTENSIONS
#define REQUIRED_EXTENSIONS \
  sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_LUT, \
               AARCH64_FL_SME2 | AARCH64_FL_LUT)
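
That is, placed around the new entries, something like this (a sketch;
per Richard's comment above the type groups would also need to change
to B/H-only groups, and "bh_data"/"h_data" below are assumed names
that may need defining if nothing suitable exists yet):

#undef REQUIRED_EXTENSIONS
#define REQUIRED_EXTENSIONS \
  sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_LUT, \
               AARCH64_FL_SME2 | AARCH64_FL_LUT)
DEF_SVE_FUNCTION (svluti2_lane, luti2, bh_data, none)
DEF_SVE_FUNCTION (svluti4_lane, luti4, bh_data, none)
DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, h_data, x2, none)
#undef REQUIRED_EXTENSIONS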

> > +
> >  #undef REQUIRED_EXTENSIONS
> >  
> >  #define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2)
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> > index f8cfe08f4c0..7dcbc0700da 100644
> > --- a/gcc/config/aarch64/aarch64-sve2.md
> > +++ b/gcc/config/aarch64/aarch64-sve2.md
> > @@ -133,6 +133,7 @@
> >  ;; ---- Optional AES extensions
> >  ;; ---- Optional SHA-3 extensions
> >  ;; ---- Optional SM4 extensions
> > +;; ---- Table lookup
> >  
> >  ;; =========================================================================
> >  ;; == Moves
> > @@ -4211,3 +4212,47 @@
> >    "sm4ekey\t%0.s, %1.s, %2.s"
> >    [(set_attr "type" "crypto_sm4")]
> >  )
> > +
> > +;; -------------------------------------------------------------------------
> > +;; ---- Table lookup
> > +;; -------------------------------------------------------------------------
> > +;; Includes:
> > +;; - LUTI2
> > +;; - LUTI4
> > +;; -------------------------------------------------------------------------
> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > +  [(set (match_operand:SVE_FULL_BS 0 "register_operand" "=w")
> > +   (unspec:SVE_FULL_BS
> > +     [(match_operand:SVE_FULL_BS 1 "register_operand" "w")
> > +      (match_operand:VNx16QI 2 "register_operand" "w")
> > +      (match_operand:DI 3 "const_int_operand")
> > +      (const_int LUTI_BITS)]
> > +     UNSPEC_SVE_LUTI))]
> > +  "TARGET_SVE2"
> > +  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> > +)

Similarly, these need to be gated on:
"TARGET_LUT && TARGET_SVE2_OR_SME2"

I've just realised that there's a similar issue with the FAMINMAX instructions
in the same file; would you be able to send a separate patch fixing those?

Thanks!

> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > +  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> > +   (unspec:<VSINGLE>
> > +           [(match_operand:SVE_FULL_H 1 "aligned_register_operand" "w")
> > +           (match_operand:VNx16QI 2 "register_operand" "w")
> > +           (match_operand:DI 3 "const_int_operand")
> > +           (const_int LUTI_BITS)]
> > +           UNSPEC_SVE_LUTI))]
> > +  "TARGET_SVE2"
> > +  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> > +)
> 
> ...there should be no .S (VNx4) variants here.  Also, the .H variants
> don't require an aligned register operand.  (FWIW, using "w" with
> "aligned_register_operand" is wrong in any case, since "w" accepts
> unaligned registers.)
> 
> It looks like we could merge the patterns above into a single SVE_FULL_BH
> pattern.
> 
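FWIW, with the gating fixed as above and the B and H patterns merged,
the single-table pattern might end up looking like this (an untested
sketch; it assumes an SVE_FULL_BH mode iterator, which would need
adding to iterators.md if it doesn't already exist):

(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
  [(set (match_operand:SVE_FULL_BH 0 "register_operand" "=w")
        (unspec:SVE_FULL_BH
          [(match_operand:SVE_FULL_BH 1 "register_operand" "w")
           (match_operand:VNx16QI 2 "register_operand" "w")
           (match_operand:DI 3 "const_int_operand")
           (const_int LUTI_BITS)]
          UNSPEC_SVE_LUTI))]
  "TARGET_LUT && TARGET_SVE2_OR_SME2"
  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
)
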
> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > +  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> > +   (unspec:<VSINGLE>
> > +           [(match_operand:SVE_FULL_Hx2 1 "aligned_register_operand" "Uw2")
> 
> This operand also isn't required to be aligned: Zn has a 5-bit encoding.
> 
> > +           (match_operand:VNx16QI 2 "register_operand" "w")
> > +           (match_operand:DI 3 "const_int_operand")
> > +           (const_int LUTI_BITS)]
> > +           UNSPEC_SVE_LUTI))]
> > +  "TARGET_SVE2"
> > +  "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
> > +)
> 
> Also, a formatting nit: it's more usual to indent the "[" in an
> unspec by one or two extra columns, rather than a full tab.
> 
> > [...]
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
> > new file mode 100644
> > index 00000000000..f423bfae2c6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
> > @@ -0,0 +1,40 @@
> > +/* { dg-do compile } */
> 
> This would be better as:
> 
> /* { dg-do assemble { target aarch64_asm_lut_ok } } */
> /* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
> 
> with lut added to:
> 
> foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
>                         "i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
>                         "sme" "sme-i16i64" "sme2" "sve-b16b16"
>                         "sme-b16b16" "sme-f16f16" "sme2p1" "fp8" "fp8fma"
>                         "ssve-fp8fma" "fp8dot2" "ssve-fp8dot2" "fp8dot4"
>                         "ssve-fp8dot4"} {
>     eval [string map [list FUNC $aarch64_ext] {
>       proc check_effective_target_aarch64_asm_FUNC_ok { } {
>         if { [istarget aarch64*-*-*] } {
>               return [check_no_compiler_messages aarch64_FUNC_assembler object {
>                       __asm__ (".arch_extension FUNC");
>               } "-march=armv8-a+FUNC"]
>         } else {
>               return 0
>         }
>       }
>     }]
> }
> 
> in target-supports.exp.
> 
> > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> > +
> > +#include "test_sve_acle.h"
> > +
> > +#pragma GCC target "+sve2+lut"
> > +#if STREAMING_COMPATIBLE
> > +#pragma GCC target "+sme2"
> > +#endif
> > +
> > +/*
> > +** luti2_test_imm0:
> > +**  luti2  z1\.h, \{ z28\.h \}, z0\[0\]
> > +**  ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_imm0, svbfloat16_t, svuint8_t, z1,
> > +                svluti2_lane_bf16 (z28, z0, 0),
> > +                svluti2_lane_bf16 (z28, z0, 0))
> 
> Following on from the comment above about these intrinsics being
> overloaded: the second call above should not have a type suffix.
> Similarly for the other tests (sorry!).
> 
> > +
> > +/*
> > +** luti2_test_imm1:
> > +**  luti2  z1\.h, \{ z28\.h \}, z0\[1\]
> > +**  ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_imm1, svbfloat16_t, svuint8_t, z1,
> > +                svluti2_lane_bf16 (z28, z0, 1),
> > +                svluti2_lane_bf16 (z28, z0, 1))
> > +
> > +/*
> > +** luti2_test_tied:
> > +**  luti2  z28\.h, \{ z28\.h \}, z0\[2\]
> > +**  ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_tied, svbfloat16_t, svuint8_t, z28,
> > +                svluti2_lane_bf16 (z28, z0, 2),
> > +                svluti2_lane_bf16 (z28, z0, 2))
> 
> I think we should test the upper bound of each range.  A combination of
> testing the upper bound + the dg-do change would have caught the ? 1 : 7
> thing above.
> 
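Agreed.  For luti2 on 16-bit elements the upper bound would be 7
(16 / 2 - 1), so for the bf16 file something like this (a sketch;
"luti2_test_immmax" is just an illustrative name, and the second call
uses the overloaded form per the comment above):

/*
** luti2_test_immmax:
**  luti2  z1\.h, \{ z28\.h \}, z0\[7\]
**  ret
*/

TEST_XN_SINGLE (luti2_test_immmax, svbfloat16_t, svuint8_t, z1,
                svluti2_lane_bf16 (z28, z0, 7),
                svluti2_lane (z28, z0, 7))
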
> > [...]
> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> > index 45ba2f47a9d..e0d9867801c 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -4800,6 +4800,18 @@ proc check_effective_target_aarch64_sve2 { } {
> >      }]
> >  }
> >  
> > +# Return 1 if this is an AArch64 target supporting LUT (Lookup table)
> > +proc check_effective_target_aarch64_lut { } {
> > +    if { ![istarget aarch64*-*-*] || ![check_effective_target_aarch64_sve2] } {
> > +        return 0
> > +    }
> > +    return [check_no_compiler_messages aarch64_lut assembly {
> > +        #if !defined (__ARM_FEATURE_LUT)
> > +        #error FOO
> > +        #endif
> > +    }]
> > +}
> > +
> >  # Return 1 if this is an AArch64 target only supporting SVE (not SVE2).
> >  proc check_effective_target_aarch64_sve1_only { } {
> >      return [expr { [check_effective_target_aarch64_sve]
> 
> I'm not sure this is needed.
> 
> Thanks,
> Richard
