On Tue, 29 Apr 2025, Richard Sandiford wrote:

> Pengfei Li <pengfei....@arm.com> writes:
> > This patch implements the folding of a vector addition followed by a
> > logical shift right by 1 (add + lsr #1) on AArch64 into an unsigned
> > halving add, allowing GCC to emit NEON or SVE2 UHADD instructions.
> >
> > For example, this patch helps improve the codegen from:
> >   add   v0.4s, v0.4s, v31.4s
> >   ushr  v0.4s, v0.4s, 1
> > to:
> >   uhadd v0.4s, v0.4s, v31.4s
> >
> > For NEON, vector operations are represented using generic mid-end
> > operations, so new folding rules are added to match.pd.  For SVE2, the
> > operations are represented using built-in GIMPLE calls, so this
> > optimization is implemented via gimple_folder.
> >
> > To ensure correctness, additional checks are introduced to guarantee
> > that the operands to UHADD are vectors in which each element has its top
> > bit cleared.
> >
> > This patch has been bootstrapped and regression tested on
> > x86_64-linux-gnu and aarch64-linux-gnu.
> >
> > gcc/ChangeLog:
> >
> >         * config/aarch64/aarch64-sve-builtins-base.cc (find_sve_builtin_call):
> >         New helper function for finding and checking a GIMPLE call.
> >         (is_undef): Rewrite with find_sve_builtin_call.
> >         (class svlsr_impl): Implement the folding for SVE2.
> >         (FUNCTION): Check and fold the pattern.
> >         * match.pd: Add new rules to implement the folding for NEON.
> >         * tree.cc (top_bit_zero_vector_p): Add a new utility function for
> >         vector top bit zero check.
> >         * tree.h (top_bit_zero_vector_p): Add a function declaration.
>
> The target-independent changes are out of my comfort area.
> Cc:ing Richi for those.
>
> But rather than top_bit_zero_vector_p, how about a more general
> nonzero_element_bits?  I've wanted something similar in the past.
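
For concreteness, here is a rough sketch of the constant-only half of such
a query, just so we are talking about the same thing.  This is purely
illustrative: the name nonzero_element_bits_sketch is made up, nothing
like it exists in tree.cc today, and the interesting SSA/non-constant
case is deliberately left out.

/* Illustrative sketch only, not existing code: return the bits that may
   be nonzero in any element of the integer vector constant T, or all-ones
   if nothing is known.  */

static wide_int
nonzero_element_bits_sketch (tree t)
{
  tree type = TREE_TYPE (t);
  unsigned int prec = element_precision (type);

  if (VECTOR_TYPE_P (type) && INTEGRAL_TYPE_P (TREE_TYPE (type)))
    {
      /* A uniform vector is as precise as its single element.  */
      if (tree elt = uniform_vector_p (t))
        {
          if (TREE_CODE (elt) == INTEGER_CST)
            return wi::to_wide (elt, prec);
        }
      /* Otherwise IOR all elements of a fixed-length VECTOR_CST.  */
      else if (TREE_CODE (t) == VECTOR_CST
               && VECTOR_CST_NELTS (t).is_constant ())
        {
          wide_int mask = wi::zero (prec);
          for (unsigned int i = 0; i < VECTOR_CST_NELTS (t).to_constant (); ++i)
            mask |= wi::to_wide (VECTOR_CST_ELT (t, i), prec);
          return mask;
        }
    }

  /* Nothing known: every bit may be set.  */
  return wi::minus_one (prec);
}

The uhadd fold would then just check that the top bit is clear in the
masks of both addends; the non-constant case is where the recursion and
ranger discussion below comes in.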

IMO a general nonzero_element_bits, either as a zero-bits mask of the full
vector width or one ANDed/IORed across elements, should be provided by
{set/get}_nonzero_bits/ranger being extended to cover [integer] vectors.

> I don't think we can use an unbounded recursive walk, since that
> would become quadratic if we ever used it when optimising one
> AND in a chain of ANDs.  (And using this function for ANDs
> seems plausible.)  Maybe we should be handling the information
> in a similar way to Ranger.

Indeed, the recursion isn't good.  I'd be fine with adding a non-recursive
(match top_bit_zero_vector_p ...) instead.

> Rather than handle the built-in case entirely in target code, how about
> having a target hook into nonzero_element_bits (or whatever replaces it)
> for machine-dependent builtins?

I guess that's reasonable once we can make use of it; we should have a
generic function handling gimple *, like we have
gimple_stmt_nonnegative_warnv_p / gimple_stmt_integer_valued_real_p.
Those also provide a recipe for limiting recursion in case you really
need that for the case in question.

But for nonzero bits, wiring this into ranger looks better.  The semantics
of a common range/mask for all elements look "easy" to implement, since
you can then re-use irange/frange and do not need a new "vector range".

Richard.

> 
> Thanks,
> Richard
> 
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/acle/uhadd_1.c: New test.
> >         * gcc.target/aarch64/sve2/acle/general/uhadd_1.c: New test.
> > ---
> >  .../aarch64/aarch64-sve-builtins-base.cc      | 101 ++++++++++++++++--
> >  gcc/match.pd                                  |   7 ++
> >  .../gcc.target/aarch64/acle/uhadd_1.c         |  34 ++++++
> >  .../aarch64/sve2/acle/general/uhadd_1.c       |  30 ++++++
> >  gcc/tree.cc                                   |  30 ++++++
> >  gcc/tree.h                                    |   4 +
> >  6 files changed, 199 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index b4396837c24..ce6da82bf81 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -43,6 +43,7 @@
> >  #include "aarch64-sve-builtins.h"
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> > +#include "aarch64-sve-builtins-sve2.h"
> >  #include "aarch64-sve-builtins-functions.h"
> >  #include "aarch64-builtins.h"
> >  #include "ssa.h"
> > @@ -53,6 +54,23 @@ using namespace aarch64_sve;
> >  
> >  namespace {
> >  
> > +/* Return gcall* if VAL is an SSA_NAME defined by the given SVE
> > +   intrinsics call.  Otherwise return NULL.  */
> > +static gcall*
> > +find_sve_builtin_call (tree val, const function_base *func)
> > +{
> > +  if (TREE_CODE (val) == SSA_NAME)
> > +    {
> > +      gimple *def = SSA_NAME_DEF_STMT (val);
> > +      if (gcall *call = dyn_cast<gcall *> (def))
> > +        if (tree fndecl = gimple_call_fndecl (call))
> > +          if (const function_instance *instance = lookup_fndecl (fndecl))
> > +            if (instance->base == func)
> > +              return call;
> > +    }
> > +  return NULL;
> > +}
> > +
> >  /* Return true if VAL is an undefined value.  */
> >  static bool
> >  is_undef (tree val)
> > @@ -62,12 +80,7 @@ is_undef (tree val)
> >        if (ssa_undefined_value_p (val, false))
> >          return true;
> >  
> > -      gimple *def = SSA_NAME_DEF_STMT (val);
> > -      if (gcall *call = dyn_cast<gcall *> (def))
> > -        if (tree fndecl = gimple_call_fndecl (call))
> > -          if (const function_instance *instance = lookup_fndecl (fndecl))
> > -            if (instance->base == functions::svundef)
> > -              return true;
> > +      return (find_sve_builtin_call (val, functions::svundef) != NULL);
> >      }
> >    return false;
> >  }
> > @@ -2088,6 +2101,80 @@ public:
> >    }
> >  };
> >  
> > +class svlsr_impl : public rtx_code_function
> > +{
> > +private:
> > +  /* Return true if we know active lanes for use in T have top bit zero,
> > +     where pg_use tells which lanes are active for use.  */
> > +  bool
> > +  active_lanes_top_bit_zero_p (tree t, tree pg_use) const
> > +  {
> > +    /* Return true if T itself is a vector in which each element has top
> > +       bit zero.  */
> > +    if (top_bit_zero_vector_p (t))
> > +      return true;
> > +
> > +    /* Return true if T is an AND op with a vector in which each element
> > +       has top bit zero.  Note the predicate for AND op should cover
> > +       active lanes for use.  */
> > +    gcall *and_call = find_sve_builtin_call (t, functions::svand);
> > +    if (and_call != NULL)
> > +      {
> > +        tree pg = gimple_call_arg (and_call, 0);
> > +        if (pg == pg_use || is_ptrue (pg, element_precision (t) / CHAR_BIT))
> > +          {
> > +            return top_bit_zero_vector_p (gimple_call_arg (and_call, 1))
> > +                   || top_bit_zero_vector_p (gimple_call_arg (and_call, 2));
> > +          }
> > +      }
> > +
> > +    return false;
> > +  }
> > +
> > +public:
> > +  CONSTEXPR svlsr_impl ()
> > +    : rtx_code_function (LSHIFTRT, LSHIFTRT) {}
> > +
> > +  gimple*
> > +  fold (gimple_folder &f) const override
> > +  {
> > +    /* Below folding applies to SVE2 only.  */
> > +    if (!TARGET_SVE2)
> > +      return NULL;
> > +
> > +    /* Fold calls for patterns of LSR (ADD (x, y), 1) to an HADD (x, y).
> > +       Note LSR and ADD should share the same pg to fold.  */
> > +    tree pg = gimple_call_arg (f.call, 0);
> > +    tree lsr_opnd = gimple_call_arg (f.call, 1);
> > +    tree lsr_dist = gimple_call_arg (f.call, 2);
> > +
> > +    gcall *add_call;
> > +    if ((add_call = find_sve_builtin_call (lsr_opnd, functions::svadd)) != NULL
> > +        && integer_onep (lsr_dist)
> > +        && gimple_call_arg (add_call, 0) == pg)
> > +      {
> > +        /* Check if we know all active lanes in the two addends of the
> > +           add_call have top bit zero, where pg indicates which lanes
> > +           are active.  */
> > +        tree addend1 = gimple_call_arg (add_call, 1);
> > +        tree addend2 = gimple_call_arg (add_call, 2);
> > +        if (active_lanes_top_bit_zero_p (addend1, pg)
> > +            && active_lanes_top_bit_zero_p (addend2, pg))
> > +          {
> > +            function_instance instance ("svhadd", functions::svhadd,
> > +                                        shapes::binary_opt_n, MODE_none,
> > +                                        f.type_suffix_ids, GROUP_none, f.pred,
> > +                                        FPM_unused);
> > +            gcall *call = f.redirect_call (instance);
> > +            gimple_call_set_arg (call, 1, addend1);
> > +            gimple_call_set_arg (call, 2, addend2);
> > +            return call;
> > +          }
> > +      }
> > +
> > +    return NULL;
> > +  }
> > +};
> > +
> >  class svmad_impl : public function_base
> >  {
> >  public:
> > @@ -3586,7 +3673,7 @@ FUNCTION (svldnt1, svldnt1_impl,)
> >  FUNCTION (svlen, svlen_impl,)
> >  FUNCTION (svlsl, svlsl_impl,)
> >  FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE))
> > -FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT))
> > +FUNCTION (svlsr, svlsr_impl,)
> >  FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE))
> >  FUNCTION (svmad, svmad_impl,)
> >  FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX,
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 0fe90a6edc4..02f70ea78e3 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -2176,6 +2176,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >      (view_convert (rshift (view_convert:ntype @0) @1))
> >      (convert (rshift (convert:ntype @0) @1))))))
> >  
> > +/* Fold (x + y) >> 1 into IFN_AVG_FLOOR (x, y), if we know x and y are
> > +   vectors in which each element has top bit zero.  */
> > +(simplify
> > + (rshift (plus:cs @0 @1) integer_onep)
> > + (if (top_bit_zero_vector_p (@0) && top_bit_zero_vector_p (@1))
> > +  (IFN_AVG_FLOOR @0 @1)))
> > +
> >  /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
> >     when profitable.
> >     For bitwise binary operations apply operand conversions to the
> > diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> > new file mode 100644
> > index 00000000000..f1748a199ad
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
> > @@ -0,0 +1,34 @@
> > +/* Test if SIMD fused unsigned halving adds are generated */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
> > +  vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
> > +  { \
> > +    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
> > +    vectype v2 = vdup ## q ## _n_ ## ts (mask); \
> > +    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
> > +  } \
> > +  \
> > +  vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
> > +  { \
> > +    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
> > +    vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
> > +    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
> > +  }
> > +
> > +FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
> > +FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
> > +FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
> > +FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
> > +FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fffffff)
> > +FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fffffff)
> > +
> > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
> > new file mode 100644
> > index 00000000000..9a219eb5086
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
> > @@ -0,0 +1,30 @@
> > +/* Test if SVE2 fused unsigned halving adds are generated */
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +#include <arm_sve.h>
> > +
> > +#define FUSED_SVE2_UHADD(vectype, ts, tspg, mask) \
> > +  vectype sve2_uhadd ## _ ## ts ## _1 (svbool_t pg, vectype a) \
> > +  { \
> > +    vectype v1 = svdup_ ## ts (mask); \
> > +    vectype v2 = svand_m (svptrue_ ## tspg (), a, svdup_ ## ts (mask)); \
> > +    return svlsr_x(pg, svadd_x (pg, v1, v2), svdup_ ## ts (1)); \
> > +  } \
> > +  \
> > +  vectype sve2_uhadd ## _ ## ts ## _2 (svbool_t pg, vectype a, vectype b) \
> > +  { \
> > +    vectype v1 = svand_m (pg, a, svdup_ ## ts (mask)); \
> > +    vectype v2 = svand_m (pg, b, svdup_ ## ts (mask)); \
> > +    return svlsr_m(pg, svadd_m (pg, v1, v2), svdup_ ## ts (1)); \
> > +  }
> > +
> > +FUSED_SVE2_UHADD (svuint8_t, u8, b8, 0x7f);
> > +FUSED_SVE2_UHADD (svuint16_t, u16, b16, 0x7fff);
> > +FUSED_SVE2_UHADD (svuint32_t, u32, b32, 0x7fffffff);
> > +FUSED_SVE2_UHADD (svuint64_t, u64, b64, 0x7fffffffffffffff);
> > +
> > +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.h, p[0-7]/m,} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.s, p[0-7]/m,} 2 } } */
> > +/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.d, p[0-7]/m,} 2 } } */
> > diff --git a/gcc/tree.cc b/gcc/tree.cc
> > index eccfcc89da4..bdee2a93a44 100644
> > --- a/gcc/tree.cc
> > +++ b/gcc/tree.cc
> > @@ -10756,6 +10756,36 @@ uniform_integer_cst_p (tree t)
> >    return NULL_TREE;
> >  }
> >  
> > +/* Checks to see if T is a vector in which each element has top bit zero
> > +   then return T otherwise NULL_TREE.  */
> > +
> > +tree
> > +top_bit_zero_vector_p (tree t)
> > +{
> > +  if (!VECTOR_TYPE_P (TREE_TYPE (t)))
> > +    return NULL_TREE;
> > +
> > +  tree elem = uniform_vector_p (t);
> > +  if (tree_fits_uhwi_p (elem))
> > +    {
> > +      unsigned int prec = element_precision (t);
> > +      if ((tree_to_uhwi (elem) & (HOST_WIDE_INT_1U << (prec - 1))) == 0)
> > +        return t;
> > +    }
> > +
> > +  if (TREE_CODE (t) == SSA_NAME)
> > +    {
> > +      gimple *def = SSA_NAME_DEF_STMT (t);
> > +      if (is_gimple_assign (def)
> > +          && gimple_assign_rhs_code (def) == BIT_AND_EXPR
> > +          && (top_bit_zero_vector_p (gimple_assign_rhs1 (def)) != NULL_TREE
> > +              || top_bit_zero_vector_p (gimple_assign_rhs2 (def)) != NULL_TREE))
> > +        return t;
> > +    }
> > +
> > +  return NULL_TREE;
> > +}
> > +
> >  /* Checks to see if T is a constant or a constant vector and if each
> >     element E adheres to ~E + 1 == pow2 then return ~E otherwise
> >     NULL_TREE.  */
> >  
> > diff --git a/gcc/tree.h b/gcc/tree.h
> > index 99f26177628..6dfbbdc1aea 100644
> > --- a/gcc/tree.h
> > +++ b/gcc/tree.h
> > @@ -5249,6 +5249,10 @@ extern tree uniform_vector_p (const_tree);
> >  
> >  extern tree uniform_integer_cst_p (tree);
> >  
> > +/* Checks to see if T is a vector in which each element has top bit zero
> > +   then return T otherwise NULL_TREE.  */
> > +extern tree top_bit_zero_vector_p (tree t);
> > +
> >  extern int single_nonzero_element (const_tree);
> >  
> >  /* Given a CONSTRUCTOR CTOR, return the element values as a vector.  */
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)