On Wed, Jan 08, 2025 at 11:13:41AM +0000, Richard Sandiford wrote:
> <saurabh....@arm.com> writes:
> > This patch introduces support for LUTI2/LUTI4 ACLE for SVE2.
> >
> > LUTI instructions are used for efficient table lookups with 2-bit
> > or 4-bit indices.  LUTI2 reads indexed 8-bit or 16-bit elements from
> > the low 128 bits of the table vector using packed 2-bit indices,
> > while LUTI4 can read from the low 128 or 256 bits of the table
> > vector, or from two table vectors, using packed 4-bit indices.
> > These instructions fill the destination vector by copying elements
> > indexed by segments of the source vector, selected by the vector
> > segment index.
> >
> > The changes include the addition of a new AArch64 option
> > extension "lut", the __ARM_FEATURE_LUT preprocessor macro, definitions
> > for the new LUTI instruction shapes, and implementations of the
> > svluti2 and svluti4 builtins.
> >
> > New tests are added as well.
> >
> > ---
> >
> > Hey,
> >
> > This is a respin of
> > https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658015.html, rebased
> > against master.  Regression tested on aarch64-unknown-linux-gnu with no
> > regressions found.
>
> Thanks for picking this up!  The issues described below are of course not
> your fault :)
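(For anyone following along: at the source level the new intrinsics end
up being used like the sketch below.  Illustrative only -- the wrapper
function is mine, not from the patch; it assumes the usual unsigned-byte
variant from the ACLE LUT spec.)

  #include <arm_sve.h>

  #pragma GCC target "+sve2+lut"

  /* Each packed 2-bit field of INDICES selects one of the first 16
     bytes of TABLE; the immediate selects which segment of the packed
     indices is consumed (0-3 for 8-bit elements).  */
  svuint8_t
  lut16 (svuint8_t table, svuint8_t indices)
  {
    return svluti2_lane_u8 (table, indices, 0);
  }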
I've spotted a couple more issues, noted below.

> > Ok for master?
> >
> > Thanks,
> > Saurabh
> > ---
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > index ca721dd2c09..0f6d366b2d6 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc
> > @@ -903,6 +903,47 @@ struct load_ext_gather_base : public overloaded_base<1>
> >    }
> >  };
> >
> > +
> > +/* sv<v0>_t svlut_<t0>(sv<t0>_t, svuint8_t, uint64_t)
>
> Because of the potential tuple argument, I suppose this should be:
>
>   sv<v0>_t svlut[_<t0>_g](sv<t0>x<g>_t, svuint8_t, uint64_t)
>
> Unlike for the ZT version of svluti, the type suffix is optional.
> To quote from the ACLE spec:
>
>   svint8_t svluti2_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
>   svint16_t svluti2_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx);
>   svint8_t svluti4_lane[_s8](svint8_t table, svuint8_t indices, uint64_t imm_idx);
>   svint16_t svluti4_lane[_s16](svint16_t table, svuint8_t indices, uint64_t imm_idx);
>   svint16_t svluti4_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx);
>
> Because of that:
>
> > +   where the final argument is a constant index, the instruction divides
> > +   the vector argument in BITS-bit quantities.  */
> > +template<unsigned int BITS>
> > +struct luti_base : public nonoverloaded_base
>
> ...this should be an overloaded function and have a resolver.
>
> > +{
> > +  void
> > +  build (function_builder &b, const function_group_info &group) const override
> > +  {
> > +    /* Format: return type, table vector, indices vector, immediate value.  */
> > +    build_all (b, "v0,t0,vu8,su64", group, MODE_none);
> > +  }
> > +
> > +  bool
> > +  check (function_checker &c) const override
> > +  {
> > +    int max_range;
> > +    bool byte_mode = c.type_suffix (0).element_bits == 8;
> > +
> > +    if (BITS == 2)
> > +      max_range = byte_mode ? 3 : 7;
> > +    else if (BITS == 4)
> > +      max_range = byte_mode ? 1 : 7;
>
> It looks like this should be ? 1 : 3, see:
>
>   https://developer.arm.com/documentation/ddi0602/2024-12/SVE-Instructions/LUTI4--Lookup-table-read-with-4-bit-indices-?lang=en
>
> Or, programmatically, I think this is:
>
>   auto max_range = c.type_suffix (0).element_bits / BITS - 1;
>
> for all cases.
>
> > +    else
> > +      /* Unsupported number of indices bits for LUTI.  */
> > +      gcc_unreachable ();
> > +
> > +    return c.require_immediate_range (2, 0, max_range);
> > +  }
> > +
> > +};
> > +
> > +/* Specializations for 2-bit and 4-bit indices.  */
> > +using luti2_def = luti_base<2>;
> > +SHAPE (luti2)
> > +
> > +using luti4_def = luti_base<4>;
> > +SHAPE (luti4)
> > +
> > +
> >  /* sv<t0>x<g>_t svfoo_t0_g(uint64_t, svuint8_t, uint64_t)
> >
> >     where the first argument is the ZT register number (currently always 0)
> > [...]
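FWIW, the element_bits / BITS - 1 formula checks out against the
architectural immediate ranges: 8/2-1 = 3 and 16/2-1 = 7 for LUTI2,
8/4-1 = 1 and 16/4-1 = 3 for LUTI4, matching the [0,3]/[0,7] and
[0,1]/[0,3] ranges in the ISA.

On the resolver: I'd expect something along the lines of the sketch
below, modelled on the existing SME2-style resolvers (untested and from
memory, so the exact helper names may be slightly off):

  tree
  resolve (function_resolver &r) const override
  {
    sve_type type;
    /* Arguments are <table>, <indices>, <imm>: infer the table type
       (possibly a tuple, for the _x2 forms) and check the other two.  */
    if (!r.check_num_arguments (3)
	|| !(type = r.infer_sve_type (0))
	|| !r.require_vector_type (1, VECTOR_TYPE_svuint8_t)
	|| !r.require_integer_immediate (2))
      return error_mark_node;
    return r.resolve_to (r.mode_suffix_id, type);
  }

with luti_base deriving from overloaded_base<0> instead of
nonoverloaded_base.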
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > index e726fa1fb68..0c4f8251ac0 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> > @@ -164,6 +164,10 @@ DEF_SVE_FUNCTION (svwhilegt, compare_scalar, while, none)
> >  DEF_SVE_FUNCTION (svwhilerw, compare_ptr, all_data, none)
> >  DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none)
> >  DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none)
> > +DEF_SVE_FUNCTION (svluti2_lane, luti2, bhs_data, none)
> > +DEF_SVE_FUNCTION (svluti4_lane, luti4, bhs_data, none)
> > +DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, bhs_data, x2, none)
>
> bhs_data looks wrong: there should be no .s versions.  Similarly...

This also needs gating; I think the correct condition is to prefix
the new intrinsics with:

  #undef REQUIRED_EXTENSIONS
  #define REQUIRED_EXTENSIONS \
    sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_LUT, \
		 AARCH64_FL_SME2 | AARCH64_FL_LUT)

> > +
> >  #undef REQUIRED_EXTENSIONS
> >
> >  #define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2)
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> > index f8cfe08f4c0..7dcbc0700da 100644
> > --- a/gcc/config/aarch64/aarch64-sve2.md
> > +++ b/gcc/config/aarch64/aarch64-sve2.md
> > @@ -133,6 +133,7 @@
> >  ;; ---- Optional AES extensions
> >  ;; ---- Optional SHA-3 extensions
> >  ;; ---- Optional SM4 extensions
> > +;; ---- Table lookup
> >
> >  ;; =========================================================================
> >  ;; == Moves
> > @@ -4211,3 +4212,47 @@
> >    "sm4ekey\t%0.s, %1.s, %2.s"
> >    [(set_attr "type" "crypto_sm4")]
> >  )
> > +
> > +;; -------------------------------------------------------------------------
> > +;; ---- Table lookup
> > +;; -------------------------------------------------------------------------
> > +;; Includes:
> > +;; - LUTI2
> > +;; - LUTI4
> > +;; -------------------------------------------------------------------------
> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > +  [(set (match_operand:SVE_FULL_BS 0 "register_operand" "=w")
> > +	(unspec:SVE_FULL_BS
> > +		[(match_operand:SVE_FULL_BS 1 "register_operand" "w")
> > +		 (match_operand:VNx16QI 2 "register_operand" "w")
> > +		 (match_operand:DI 3 "const_int_operand")
> > +		 (const_int LUTI_BITS)]
> > +		UNSPEC_SVE_LUTI))]
> > +  "TARGET_SVE2"
> > +  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> > +)

Similarly, these need to be gated on:

  "TARGET_LUT && TARGET_SVE2_OR_SME2"

I've just realised that there's a similar issue with the FAMINMAX
instructions in the same file; would you be able to send a separate
patch fixing those?  Thanks!

> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > +  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> > +	(unspec:<VSINGLE>
> > +		[(match_operand:SVE_FULL_H 1 "aligned_register_operand" "w")
> > +		 (match_operand:VNx16QI 2 "register_operand" "w")
> > +		 (match_operand:DI 3 "const_int_operand")
> > +		 (const_int LUTI_BITS)]
> > +		UNSPEC_SVE_LUTI))]
> > +  "TARGET_SVE2"
> > +  "luti<LUTI_BITS>\t%0.<Vetype>, { %1.<Vetype> }, %2[%3]"
> > +)
>
> ...there shouldn't be .S (VNx4) variants here.  Also, the .H variants
> don't require an aligned register operand.  (FWIW, using "w" with
> "aligned_register_operand" is wrong in any case, since "w" accepts
> unaligned registers.)
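Going back to the .def hunk: putting the gating and bhs_data comments
together, I'd expect the block to end up roughly as below (a sketch:
"bh_data" and "h_data" stand for whatever type-suffix groups restrict
these to the 8/16-bit and 16-bit element types respectively, and may
need adding if they don't already exist):

  #undef REQUIRED_EXTENSIONS
  #define REQUIRED_EXTENSIONS \
    sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_LUT, \
		 AARCH64_FL_SME2 | AARCH64_FL_LUT)
  DEF_SVE_FUNCTION (svluti2_lane, luti2, bh_data, none)
  DEF_SVE_FUNCTION (svluti4_lane, luti4, bh_data, none)
  DEF_SVE_FUNCTION_GS (svluti4_lane, luti4, h_data, x2, none)
  #undef REQUIRED_EXTENSIONS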
>
> It looks like we could merge the patterns above into a single SVE_FULL_BH
> pattern.
>
> > +
> > +(define_insn "@aarch64_sve_luti<LUTI_BITS><mode>"
> > +  [(set (match_operand:<VSINGLE> 0 "register_operand" "=w")
> > +	(unspec:<VSINGLE>
> > +		[(match_operand:SVE_FULL_Hx2 1 "aligned_register_operand" "Uw2")
>
> This operand also isn't required to be aligned: Zn has a 5-bit encoding.
>
> > +		 (match_operand:VNx16QI 2 "register_operand" "w")
> > +		 (match_operand:DI 3 "const_int_operand")
> > +		 (const_int LUTI_BITS)]
> > +		UNSPEC_SVE_LUTI))]
> > +  "TARGET_SVE2"
> > +  "luti<LUTI_BITS>\t%0.<Vetype>, %1, %2[%3]"
> > +)
>
> Also, formatting nit, but: it's more usual to indent the "[" in an
> unspec by 2 or 1 extra columns, rather than a full tab.
>
> > [...]
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
> > new file mode 100644
> > index 00000000000..f423bfae2c6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/luti2_bf16.c
> > @@ -0,0 +1,40 @@
> > +/* { dg-do compile } */
>
> This would be better as:
>
>   /* { dg-do assemble { target aarch64_asm_lut_ok } } */
>   /* { dg-do compile { target { ! aarch64_asm_lut_ok } } } */
>
> with lut added to:
>
>   foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
>			      "i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
>			      "sme" "sme-i16i64" "sme2" "sve-b16b16"
>			      "sme-b16b16" "sme-f16f16" "sme2p1" "fp8" "fp8fma"
>			      "ssve-fp8fma" "fp8dot2" "ssve-fp8dot2" "fp8dot4"
>			      "ssve-fp8dot4"} {
>	eval [string map [list FUNC $aarch64_ext] {
>	    proc check_effective_target_aarch64_asm_FUNC_ok { } {
>		if { [istarget aarch64*-*-*] } {
>		    return [check_no_compiler_messages aarch64_FUNC_assembler object {
>			__asm__ (".arch_extension FUNC");
>		    } "-march=armv8-a+FUNC"]
>		} else {
>		    return 0
>		}
>	    }
>	}]
>   }
>
> in target-supports.exp.
>
> > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> > +
> > +#include "test_sve_acle.h"
> > +
> > +#pragma GCC target "+sve2+lut"
> > +#if STREAMING_COMPATIBLE
> > +#pragma GCC target "+sme2"
> > +#endif
> > +
> > +/*
> > +** luti2_test_imm0:
> > +**	luti2	z1\.h, \{ z28\.h \}, z0\[0\]
> > +**	ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_imm0, svbfloat16_t, svuint8_t, z1,
> > +		svluti2_lane_bf16 (z28, z0, 0),
> > +		svluti2_lane_bf16 (z28, z0, 0))
>
> Following on from the comment above about these intrinsics being
> overloaded: the second call above should not have a type suffix.
> Similarly for the other tests (sorry!).
>
> > +
> > +/*
> > +** luti2_test_imm1:
> > +**	luti2	z1\.h, \{ z28\.h \}, z0\[1\]
> > +**	ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_imm1, svbfloat16_t, svuint8_t, z1,
> > +		svluti2_lane_bf16 (z28, z0, 1),
> > +		svluti2_lane_bf16 (z28, z0, 1))
> > +
> > +/*
> > +** luti2_test_tied:
> > +**	luti2	z28\.h, \{ z28\.h \}, z0\[2\]
> > +**	ret
> > +*/
> > +
> > +TEST_XN_SINGLE (luti2_test_tied, svbfloat16_t, svuint8_t, z28,
> > +		svluti2_lane_bf16 (z28, z0, 2),
> > +		svluti2_lane_bf16 (z28, z0, 2))
>
> I think we should test the upper bound of each range.  A combination of
> testing the upper bound + the dg-do change would have caught the ? 1 : 7
> thing above.
>
> > [...]
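For the upper bounds, e.g. something like this for the bf16 LUTI2 file
(a sketch; note the unsuffixed second call, following the overloading
comment above -- 7 being the top of the range for 2-bit indices with
16-bit elements):

  /*
  ** luti2_test_imm_max:
  **	luti2	z1\.h, \{ z28\.h \}, z0\[7\]
  **	ret
  */

  TEST_XN_SINGLE (luti2_test_imm_max, svbfloat16_t, svuint8_t, z1,
		  svluti2_lane_bf16 (z28, z0, 7),
		  svluti2_lane (z28, z0, 7))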
> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> > index 45ba2f47a9d..e0d9867801c 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -4800,6 +4800,18 @@ proc check_effective_target_aarch64_sve2 { } {
> >      }]
> >  }
> >
> > +# Return 1 if this is an AArch64 target supporting LUT (Lookup table)
> > +proc check_effective_target_aarch64_lut { } {
> > +    if { ![istarget aarch64*-*-*] || ![check_effective_target_aarch64_sve2] } {
> > +	return 0
> > +    }
> > +    return [check_no_compiler_messages aarch64_lut assembly {
> > +	#if !defined (__ARM_FEATURE_LUT)
> > +	#error FOO
> > +	#endif
> > +    }]
> > +}
> > +
> >  # Return 1 if this is an AArch64 target only supporting SVE (not SVE2).
> >  proc check_effective_target_aarch64_sve1_only { } {
> >      return [expr { [check_effective_target_aarch64_sve]
>
> I'm not sure this is needed.
>
> Thanks,
> Richard