This patch folds a vector addition followed by a logical shift right by one (add + lsr #1) into an unsigned halving add on AArch64, allowing GCC to emit NEON or SVE2 UHADD instructions.
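
As an illustration (the function below is only a sketch and is not part of the patch, though the new tests exercise the same shape), a NEON intrinsics routine in which both addends provably have their top bit clear might look like:

  #include <arm_neon.h>

  /* Both operands are masked with 0x7fffffff, so every element has its top
     bit clear and the 32-bit add cannot wrap; the add + ushr #1 sequence is
     therefore a candidate for the halving-add fold.  */
  uint32x4_t
  uhadd_sketch (uint32x4_t a, uint32x4_t b)
  {
    uint32x4_t x = vandq_u32 (a, vdupq_n_u32 (0x7fffffff));
    uint32x4_t y = vandq_u32 (b, vdupq_n_u32 (0x7fffffff));
    return vshrq_n_u32 (vaddq_u32 (x, y), 1);
  }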

For example, this patch helps improve the codegen from:

        add     v0.4s, v0.4s, v31.4s
        ushr    v0.4s, v0.4s, 1

to:

        uhadd   v0.4s, v0.4s, v31.4s

For NEON, vector operations are represented using generic mid-end
operations, so new folding rules are added to match.pd.  For SVE2, the
operations are represented using built-in GIMPLE calls, so this
optimization is implemented via gimple_folder.  To ensure correctness,
additional checks are introduced to guarantee that the operands to UHADD
are vectors in which each element has its top bit cleared.

This patch has been bootstrapped and regression tested on
x86_64-linux-gnu and aarch64-linux-gnu.

gcc/ChangeLog:

        * config/aarch64/aarch64-sve-builtins-base.cc
        (find_sve_builtin_call): New helper function for finding and
        checking a GIMPLE call.
        (is_undef): Rewrite with find_sve_builtin_call.
        (class svlsr_impl): Implement the folding for SVE2.
        (FUNCTION): Check and fold the pattern.
        * match.pd: Add new rules to implement the folding for NEON.
        * tree.cc (top_bit_zero_vector_p): Add a new utility function
        for vector top bit zero check.
        * tree.h (top_bit_zero_vector_p): Add a function declaration.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/acle/uhadd_1.c: New test.
        * gcc.target/aarch64/sve2/acle/general/uhadd_1.c: New test.
---
 .../aarch64/aarch64-sve-builtins-base.cc        | 101 ++++++++++++++++--
 gcc/match.pd                                    |   7 ++
 .../gcc.target/aarch64/acle/uhadd_1.c           |  34 ++++++
 .../aarch64/sve2/acle/general/uhadd_1.c         |  30 ++++++
 gcc/tree.cc                                     |  30 ++++++
 gcc/tree.h                                      |   4 +
 6 files changed, 199 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index b4396837c24..ce6da82bf81 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -43,6 +43,7 @@
 #include "aarch64-sve-builtins.h"
 #include "aarch64-sve-builtins-shapes.h"
 #include "aarch64-sve-builtins-base.h"
+#include "aarch64-sve-builtins-sve2.h"
 #include "aarch64-sve-builtins-functions.h"
 #include "aarch64-builtins.h"
 #include "ssa.h"
@@ -53,6 +54,23 @@ using namespace aarch64_sve;
 
 namespace {
 
+/* Return gcall* if VAL is an SSA_NAME defined by the given SVE intrinsics
+   call.  Otherwise return NULL.  */
+static gcall*
+find_sve_builtin_call (tree val, const function_base *func)
+{
+  if (TREE_CODE (val) == SSA_NAME)
+    {
+      gimple *def = SSA_NAME_DEF_STMT (val);
+      if (gcall *call = dyn_cast<gcall *> (def))
+        if (tree fndecl = gimple_call_fndecl (call))
+          if (const function_instance *instance = lookup_fndecl (fndecl))
+            if (instance->base == func)
+              return call;
+    }
+  return NULL;
+}
+
 /* Return true if VAL is an undefined value.  */
 static bool
 is_undef (tree val)
@@ -62,12 +80,7 @@ is_undef (tree val)
       if (ssa_undefined_value_p (val, false))
        return true;
 
-      gimple *def = SSA_NAME_DEF_STMT (val);
-      if (gcall *call = dyn_cast<gcall *> (def))
-        if (tree fndecl = gimple_call_fndecl (call))
-          if (const function_instance *instance = lookup_fndecl (fndecl))
-            if (instance->base == functions::svundef)
-              return true;
+      return (find_sve_builtin_call (val, functions::svundef) != NULL);
     }
   return false;
 }
@@ -2088,6 +2101,80 @@ public:
   }
 };
 
+class svlsr_impl : public rtx_code_function
+{
+private:
+  /* Return true if we know the active lanes of T have their top bit zero,
+     where PG_USE tells which lanes are active.  */
+  bool
+  active_lanes_top_bit_zero_p (tree t, tree pg_use) const
+  {
+    /* Return true if T itself is a vector in which each element has top bit
+       zero.  */
+    if (top_bit_zero_vector_p (t))
+      return true;
+
+    /* Return true if T is an AND op with a vector in which each element has
+       top bit zero.  Note the predicate for the AND op should cover the
+       active lanes.  */
+    gcall *and_call = find_sve_builtin_call (t, functions::svand);
+    if (and_call != NULL)
+      {
+        tree pg = gimple_call_arg (and_call, 0);
+        if (pg == pg_use || is_ptrue (pg, element_precision (t) / CHAR_BIT))
+          {
+            return top_bit_zero_vector_p (gimple_call_arg (and_call, 1))
+                   || top_bit_zero_vector_p (gimple_call_arg (and_call, 2));
+          }
+      }
+
+    return false;
+  }
+
+public:
+  CONSTEXPR svlsr_impl ()
+    : rtx_code_function (LSHIFTRT, LSHIFTRT) {}
+
+  gimple*
+  fold (gimple_folder &f) const override
+  {
+    /* The folding below applies to SVE2 only.  */
+    if (!TARGET_SVE2)
+      return NULL;
+
+    /* Fold calls matching LSR (ADD (x, y), 1) to HADD (x, y).  Note the LSR
+       and ADD must share the same pg for the fold to apply.  */
+    tree pg = gimple_call_arg (f.call, 0);
+    tree lsr_opnd = gimple_call_arg (f.call, 1);
+    tree lsr_dist = gimple_call_arg (f.call, 2);
+
+    gcall *add_call;
+    if ((add_call = find_sve_builtin_call (lsr_opnd, functions::svadd)) != NULL
+        && integer_onep (lsr_dist)
+        && gimple_call_arg (add_call, 0) == pg)
+      {
+        /* Check that we know all active lanes in the two addends of add_call
+           have top bit zero, where pg indicates which lanes are active.  */
+        tree addend1 = gimple_call_arg (add_call, 1);
+        tree addend2 = gimple_call_arg (add_call, 2);
+        if (active_lanes_top_bit_zero_p (addend1, pg)
+            && active_lanes_top_bit_zero_p (addend2, pg))
+          {
+            function_instance instance ("svhadd", functions::svhadd,
+                                        shapes::binary_opt_n, MODE_none,
+                                        f.type_suffix_ids, GROUP_none, f.pred,
+                                        FPM_unused);
+            gcall *call = f.redirect_call (instance);
+            gimple_call_set_arg (call, 1, addend1);
+            gimple_call_set_arg (call, 2, addend2);
+            return call;
+          }
+      }
+
+    return NULL;
+  }
+};
+
 class svmad_impl : public function_base
 {
 public:
@@ -3586,7 +3673,7 @@ FUNCTION (svldnt1, svldnt1_impl,)
 FUNCTION (svlen, svlen_impl,)
 FUNCTION (svlsl, svlsl_impl,)
 FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE))
-FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT))
+FUNCTION (svlsr, svlsr_impl,)
 FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE))
 FUNCTION (svmad, svmad_impl,)
 FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX,
diff --git a/gcc/match.pd b/gcc/match.pd
index 0fe90a6edc4..02f70ea78e3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2176,6 +2176,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (view_convert (rshift (view_convert:ntype @0) @1))
    (convert (rshift (convert:ntype @0) @1))))))
 
+/* Fold ((x + y) >> 1) into IFN_AVG_FLOOR (x, y),
+   if we know x and y are vectors in which each element has top bit zero.  */
+(simplify
+ (rshift (plus:cs @0 @1) integer_onep)
+ (if (top_bit_zero_vector_p (@0) && top_bit_zero_vector_p (@1))
+  (IFN_AVG_FLOOR @0 @1)))
+
 /* Try to fold (type) X op CST -> (type) (X op ((type-x) CST))
    when profitable.
    For bitwise binary operations apply operand conversions to the
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
new file mode 100644
index 00000000000..f1748a199ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/uhadd_1.c
@@ -0,0 +1,34 @@
+/* Test if SIMD fused unsigned halving adds are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+#define FUSED_SIMD_UHADD(vectype, q, ts, mask) \
+  vectype simd_uhadd ## q ## _ ## ts ## _1 (vectype a) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vdup ## q ## _n_ ## ts (mask); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  } \
+  \
+  vectype simd_uhadd ## q ## _ ## ts ## _2 (vectype a, vectype b) \
+  { \
+    vectype v1 = vand ## q ## _ ## ts (a, vdup ## q ## _n_ ## ts (mask)); \
+    vectype v2 = vand ## q ## _ ## ts (b, vdup ## q ## _n_ ## ts (mask)); \
+    return vshr ## q ## _n_ ## ts (vadd ## q ## _ ## ts (v1, v2), 1); \
+  }
+
+FUSED_SIMD_UHADD (uint8x8_t, , u8, 0x7f)
+FUSED_SIMD_UHADD (uint8x16_t, q, u8, 0x7f)
+FUSED_SIMD_UHADD (uint16x4_t, , u16, 0x7fff)
+FUSED_SIMD_UHADD (uint16x8_t, q, u16, 0x7fff)
+FUSED_SIMD_UHADD (uint32x2_t, , u32, 0x7fffffff)
+FUSED_SIMD_UHADD (uint32x4_t, q, u32, 0x7fffffff)
+
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.16b,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.8h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.2s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tv[0-9]+\.4s,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
new file mode 100644
index 00000000000..9a219eb5086
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/general/uhadd_1.c
@@ -0,0 +1,30 @@
+/* Test if SVE2 fused unsigned halving adds are generated.  */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+#define FUSED_SVE2_UHADD(vectype, ts, tspg, mask) \
+  vectype sve2_uhadd ## _ ## ts ## _1 (svbool_t pg, vectype a) \
+  { \
+    vectype v1 = svdup_ ## ts (mask); \
+    vectype v2 = svand_m (svptrue_ ## tspg (), a, svdup_ ## ts (mask)); \
+    return svlsr_x (pg, svadd_x (pg, v1, v2), svdup_ ## ts (1)); \
+  } \
+  \
+  vectype sve2_uhadd ## _ ## ts ## _2 (svbool_t pg, vectype a, vectype b) \
+  { \
+    vectype v1 = svand_m (pg, a, svdup_ ## ts (mask)); \
+    vectype v2 = svand_m (pg, b, svdup_ ## ts (mask)); \
+    return svlsr_m (pg, svadd_m (pg, v1, v2), svdup_ ## ts (1)); \
+  }
+
+FUSED_SVE2_UHADD (svuint8_t, u8, b8, 0x7f);
+FUSED_SVE2_UHADD (svuint16_t, u16, b16, 0x7fff);
+FUSED_SVE2_UHADD (svuint32_t, u32, b32, 0x7fffffff);
+FUSED_SVE2_UHADD (svuint64_t, u64, b64, 0x7fffffffffffffff);
+
+/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.b, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.h, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.s, p[0-7]/m,} 2 } } */
+/* { dg-final { scan-assembler-times {\tuhadd\tz[0-9]+\.d, p[0-7]/m,} 2 } } */
diff --git a/gcc/tree.cc b/gcc/tree.cc
index eccfcc89da4..bdee2a93a44 100644
--- a/gcc/tree.cc
+++ b/gcc/tree.cc
@@ -10756,6 +10756,36 @@ uniform_integer_cst_p (tree t)
   return NULL_TREE;
 }
 
+/* Checks to see if T is a vector in which each element has its top bit zero;
+   if so return T, otherwise return NULL_TREE.  */
+
+tree
+top_bit_zero_vector_p (tree t)
+{
+  if (!VECTOR_TYPE_P (TREE_TYPE (t)))
+    return NULL_TREE;
+
+  tree elem = uniform_vector_p (t);
+  if (tree_fits_uhwi_p (elem))
+    {
+      unsigned int prec = element_precision (t);
+      if ((tree_to_uhwi (elem) & (HOST_WIDE_INT_1U << (prec - 1))) == 0)
+        return t;
+    }
+
+  if (TREE_CODE (t) == SSA_NAME)
+    {
+      gimple *def = SSA_NAME_DEF_STMT (t);
+      if (is_gimple_assign (def)
+          && gimple_assign_rhs_code (def) == BIT_AND_EXPR
+          && (top_bit_zero_vector_p (gimple_assign_rhs1 (def)) != NULL_TREE
+              || top_bit_zero_vector_p (gimple_assign_rhs2 (def)) != NULL_TREE))
+        return t;
+    }
+
+  return NULL_TREE;
+}
+
 /* Checks to see if T is a constant or a constant vector and if each element E
    adheres to ~E + 1 == pow2 then return ~E otherwise NULL_TREE.  */
diff --git a/gcc/tree.h b/gcc/tree.h
index 99f26177628..6dfbbdc1aea 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -5249,6 +5249,10 @@ extern tree uniform_vector_p (const_tree);
 
 extern tree uniform_integer_cst_p (tree);
 
+/* Checks to see if T is a vector in which each element has its top bit zero;
+   if so return T, otherwise return NULL_TREE.  */
+extern tree top_bit_zero_vector_p (tree t);
+
 extern int single_nonzero_element (const_tree);
 
 /* Given a CONSTRUCTOR CTOR, return the element values as a vector.  */
-- 
2.43.0