Jennifer Schmitz <jschm...@nvidia.com> writes:
> This patch folds signed SVE division where all divisor elements are the same
> power of 2 to svasrd. Tests were added to check 1) whether the transform is
> applied, i.e. asrd is used, and 2) correctness for all possible input types
> for svdiv, predication, and a variety of values. As the transform is applied
> only to signed integers, correctness for predication and values was only
> tested for svint32_t and svint64_t.
> Existing svdiv tests were adjusted such that the divisor is no longer a
> power of 2.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>
> 	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl): Implement
> 	fold and expand.
>
> gcc/testsuite/
>
> 	* gcc.target/aarch64/sve/div_const_1.c: New test.
> 	* gcc.target/aarch64/sve/div_const_1_run.c: Likewise.
> 	* gcc.target/aarch64/sve/acle/asm/div_s32.c: Adjust expected output.
> 	* gcc.target/aarch64/sve/acle/asm/div_s64.c: Likewise.
>
> From e8ffbab52ad7b9307cbfc9dbca4ef4d20e08804b Mon Sep 17 00:00:00 2001
> From: Jennifer Schmitz <jschm...@nvidia.com>
> Date: Tue, 16 Jul 2024 01:59:50 -0700
> Subject: [PATCH 1/2] SVE intrinsics: Add strength reduction for division by
>  constant.
>
> This patch folds signed SVE division where all divisor elements are the same
> power of 2 to svasrd. Tests were added to check 1) whether the transform is
> applied, i.e. asrd is used, and 2) correctness for all possible input types
> for svdiv, predication, and a variety of values. As the transform is applied
> only to signed integers, correctness for predication and values was only
> tested for svint32_t and svint64_t.
> Existing svdiv tests were adjusted such that the divisor is no longer a
> power of 2.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>
> 	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl): Implement
> 	fold and expand.
>
> gcc/testsuite/
>
> 	* gcc.target/aarch64/sve/div_const_1.c: New test.
> 	* gcc.target/aarch64/sve/div_const_1_run.c: Likewise.
> 	* gcc.target/aarch64/sve/acle/asm/div_s32.c: Adjust expected output.
> 	* gcc.target/aarch64/sve/acle/asm/div_s64.c: Likewise.
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc      | 44 ++++++++-
>  .../gcc.target/aarch64/sve/acle/asm/div_s32.c | 60 ++++++------
>  .../gcc.target/aarch64/sve/acle/asm/div_s64.c | 60 ++++++------
>  .../gcc.target/aarch64/sve/div_const_1.c      | 34 +++++++
>  .../gcc.target/aarch64/sve/div_const_1_run.c  | 91 +++++++++++++++++++
>  5 files changed, 228 insertions(+), 61 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/div_const_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/div_const_1_run.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index aa26370d397..d821cc96588 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -746,6 +746,48 @@ public:
>    }
>  };
>
> +class svdiv_impl : public unspec_based_function
> +{
> +public:
> +  CONSTEXPR svdiv_impl ()
> +    : unspec_based_function (DIV, UDIV, UNSPEC_COND_FDIV) {}
> +
> +  gimple *
> +  fold (gimple_folder &f) const override
> +  {
> +    tree divisor = gimple_call_arg (f.call, 2);
> +    tree divisor_cst = uniform_integer_cst_p (divisor);
> +
> +    if (f.type_suffix (0).unsigned_p)
> +      {
> +	return NULL;
> +      }
We might as well test this first, since it doesn't depend on the
divisor_cst result.

Formatting nit: should be no braces for single statements, so:

  if (f.type_suffix (0).unsigned_p)
    return NULL;

Same for the others.

> +
> +    if (!divisor_cst)
> +      {
> +	return NULL;
> +      }
> +
> +    if (!integer_pow2p (divisor_cst))
> +      {
> +	return NULL;
> +      }
> +
> +    function_instance instance ("svasrd", functions::svasrd, shapes::shift_right_imm, MODE_n, f.type_suffix_ids, GROUP_none, f.pred);

This line is above the 80-character limit.  Maybe:

    function_instance instance ("svasrd", functions::svasrd,
				shapes::shift_right_imm, MODE_n,
				f.type_suffix_ids, GROUP_none, f.pred);

> +    gcall *call = as_a <gcall *> (f.redirect_call (instance));

Looks like an oversight that redirect_call doesn't return a gcall
directly.  IMO it'd be better to fix that instead.

> +    tree shift_amt = wide_int_to_tree (TREE_TYPE (divisor_cst), tree_log2 (divisor_cst));

This ought to have type uint64_t instead, to match the function
prototype.  That can be had from scalar_types[VECTOR_TYPE_svuint64_t].

> +    gimple_call_set_arg (call, 2, shift_amt);
> +    return call;
> +  }
> +
> +  rtx
> +  expand (function_expander &e) const override
> +  {
> +    return e.map_to_rtx_codes (DIV, UDIV, UNSPEC_COND_FDIV, -1, DEFAULT_MERGE_ARGNO);
> +  }

This shouldn't be necessary, given the inheritance from
unspec_based_function.

> +};
> +
> +
>  class svdot_impl : public function_base
>  {
>  public:
> @@ -3043,7 +3085,7 @@ FUNCTION (svcreate3, svcreate_impl, (3))
>  FUNCTION (svcreate4, svcreate_impl, (4))
>  FUNCTION (svcvt, svcvt_impl,)
>  FUNCTION (svcvtnt, CODE_FOR_MODE0 (aarch64_sve_cvtnt),)
> -FUNCTION (svdiv, rtx_code_function, (DIV, UDIV, UNSPEC_COND_FDIV))
> +FUNCTION (svdiv, svdiv_impl,)
>  FUNCTION (svdivr, rtx_code_function_rotated, (DIV, UDIV, UNSPEC_COND_FDIV))
>  FUNCTION (svdot, svdot_impl,)
>  FUNCTION (svdot_lane, svdotprod_lane_impl, (UNSPEC_SDOT, UNSPEC_UDOT,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
> index c49ca1aa524..da2fe7c5451 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
> @@ -54,25 +54,25 @@ TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t,
>  		z0 = svdiv_m (p0, z1, x0))
>
>  /*
> -** div_2_s32_m_tied1:
> -**	mov	(z[0-9]+\.s), #2
> +** div_3_s32_m_tied1:
> +**	mov	(z[0-9]+\.s), #3
>  **	sdiv	z0\.s, p0/m, z0\.s, \1
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s32_m_tied1, svint32_t,
> -		z0 = svdiv_n_s32_m (p0, z0, 2),
> -		z0 = svdiv_m (p0, z0, 2))
> +TEST_UNIFORM_Z (div_3_s32_m_tied1, svint32_t,
> +		z0 = svdiv_n_s32_m (p0, z0, 3),
> +		z0 = svdiv_m (p0, z0, 3))

I think we should test both 2 and 3, using this harness to make sure
that svdiv of 2 does become svasrd.  (Especially since the new test
is specific to fixed-length vectors.)  It would be good to test the
limits too: 1 and 1<<30.

Presumably 0b1000... (-1<<31) shouldn't be optimised, so we should
test that too.

Same idea (with adjusted limits) for s64.
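For instance, something like this for the x-predicated tied case
(an untested sketch -- the exact asrd pattern may need tweaking once
the fold is in place):

    /*
    ** div_2_s32_x_tied1:
    **	asrd	z0\.s, p0/m, z0\.s, #1
    **	ret
    */
    TEST_UNIFORM_Z (div_2_s32_x_tied1, svint32_t,
		    z0 = svdiv_n_s32_x (p0, z0, 2),
		    z0 = svdiv_x (p0, z0, 2))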
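For the record, pulling the code comments above together, fold could
end up looking something like the below (untested, assuming that
redirect_call is changed to return a gcall * directly, and with the
expand override dropped, since unspec_based_function already
provides it):

    gimple *
    fold (gimple_folder &f) const override
    {
      /* The transform is only valid for signed division.  */
      if (f.type_suffix (0).unsigned_p)
	return NULL;

      tree divisor = gimple_call_arg (f.call, 2);
      tree divisor_cst = uniform_integer_cst_p (divisor);

      if (!divisor_cst || !integer_pow2p (divisor_cst))
	return NULL;

      /* Rewrite the call to svasrd, passing the log2 of the divisor
	 as a uint64_t shift amount.  */
      function_instance instance ("svasrd", functions::svasrd,
				  shapes::shift_right_imm, MODE_n,
				  f.type_suffix_ids, GROUP_none, f.pred);
      gcall *call = f.redirect_call (instance);
      tree amt = wide_int_to_tree (scalar_types[VECTOR_TYPE_svuint64_t],
				   tree_log2 (divisor_cst));
      gimple_call_set_arg (call, 2, amt);
      return call;
    }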
Thanks,
Richard

>
>  /*
> -** div_2_s32_m_untied:
> -**	mov	(z[0-9]+\.s), #2
> +** div_3_s32_m_untied:
> +**	mov	(z[0-9]+\.s), #3
>  **	movprfx	z0, z1
>  **	sdiv	z0\.s, p0/m, z0\.s, \1
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s32_m_untied, svint32_t,
> -		z0 = svdiv_n_s32_m (p0, z1, 2),
> -		z0 = svdiv_m (p0, z1, 2))
> +TEST_UNIFORM_Z (div_3_s32_m_untied, svint32_t,
> +		z0 = svdiv_n_s32_m (p0, z1, 3),
> +		z0 = svdiv_m (p0, z1, 3))
>
>  /*
>  ** div_s32_z_tied1:
> @@ -137,19 +137,19 @@ TEST_UNIFORM_ZX (div_w0_s32_z_untied, svint32_t, int32_t,
>  		z0 = svdiv_z (p0, z1, x0))
>
>  /*
> -** div_2_s32_z_tied1:
> -**	mov	(z[0-9]+\.s), #2
> +** div_3_s32_z_tied1:
> +**	mov	(z[0-9]+\.s), #3
>  **	movprfx	z0\.s, p0/z, z0\.s
>  **	sdiv	z0\.s, p0/m, z0\.s, \1
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s32_z_tied1, svint32_t,
> -		z0 = svdiv_n_s32_z (p0, z0, 2),
> -		z0 = svdiv_z (p0, z0, 2))
> +TEST_UNIFORM_Z (div_3_s32_z_tied1, svint32_t,
> +		z0 = svdiv_n_s32_z (p0, z0, 3),
> +		z0 = svdiv_z (p0, z0, 3))
>
>  /*
> -** div_2_s32_z_untied:
> -**	mov	(z[0-9]+\.s), #2
> +** div_3_s32_z_untied:
> +**	mov	(z[0-9]+\.s), #3
>  ** (
>  **	movprfx	z0\.s, p0/z, z1\.s
>  **	sdiv	z0\.s, p0/m, z0\.s, \1
> @@ -159,9 +159,9 @@ TEST_UNIFORM_Z (div_2_s32_z_tied1, svint32_t,
>  ** )
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s32_z_untied, svint32_t,
> -		z0 = svdiv_n_s32_z (p0, z1, 2),
> -		z0 = svdiv_z (p0, z1, 2))
> +TEST_UNIFORM_Z (div_3_s32_z_untied, svint32_t,
> +		z0 = svdiv_n_s32_z (p0, z1, 3),
> +		z0 = svdiv_z (p0, z1, 3))
>
>  /*
>  ** div_s32_x_tied1:
> @@ -217,21 +217,21 @@ TEST_UNIFORM_ZX (div_w0_s32_x_untied, svint32_t, int32_t,
>  		z0 = svdiv_x (p0, z1, x0))
>
>  /*
> -** div_2_s32_x_tied1:
> -**	mov	(z[0-9]+\.s), #2
> +** div_3_s32_x_tied1:
> +**	mov	(z[0-9]+\.s), #3
>  **	sdiv	z0\.s, p0/m, z0\.s, \1
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s32_x_tied1, svint32_t,
> -		z0 = svdiv_n_s32_x (p0, z0, 2),
> -		z0 = svdiv_x (p0, z0, 2))
> +TEST_UNIFORM_Z (div_3_s32_x_tied1, svint32_t,
> +		z0 = svdiv_n_s32_x (p0, z0, 3),
> +		z0 = svdiv_x (p0, z0, 3))
>
>  /*
> -** div_2_s32_x_untied:
> -**	mov	z0\.s, #2
> +** div_3_s32_x_untied:
> +**	mov	z0\.s, #3
>  **	sdivr	z0\.s, p0/m, z0\.s, z1\.s
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s32_x_untied, svint32_t,
> -		z0 = svdiv_n_s32_x (p0, z1, 2),
> -		z0 = svdiv_x (p0, z1, 2))
> +TEST_UNIFORM_Z (div_3_s32_x_untied, svint32_t,
> +		z0 = svdiv_n_s32_x (p0, z1, 3),
> +		z0 = svdiv_x (p0, z1, 3))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
> index 464dca28d74..e4af406344b 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
> @@ -54,25 +54,25 @@ TEST_UNIFORM_ZX (div_x0_s64_m_untied, svint64_t, int64_t,
>  		z0 = svdiv_m (p0, z1, x0))
>
>  /*
> -** div_2_s64_m_tied1:
> -**	mov	(z[0-9]+\.d), #2
> +** div_3_s64_m_tied1:
> +**	mov	(z[0-9]+\.d), #3
>  **	sdiv	z0\.d, p0/m, z0\.d, \1
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s64_m_tied1, svint64_t,
> -		z0 = svdiv_n_s64_m (p0, z0, 2),
> -		z0 = svdiv_m (p0, z0, 2))
> +TEST_UNIFORM_Z (div_3_s64_m_tied1, svint64_t,
> +		z0 = svdiv_n_s64_m (p0, z0, 3),
> +		z0 = svdiv_m (p0, z0, 3))
>
>  /*
> -** div_2_s64_m_untied:
> -**	mov	(z[0-9]+\.d), #2
> +** div_3_s64_m_untied:
> +**	mov	(z[0-9]+\.d), #3
>  **	movprfx	z0, z1
>  **	sdiv	z0\.d, p0/m, z0\.d, \1
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s64_m_untied, svint64_t,
> -		z0 = svdiv_n_s64_m (p0, z1, 2),
> -		z0 = svdiv_m (p0, z1, 2))
> +TEST_UNIFORM_Z (div_3_s64_m_untied, svint64_t,
> +		z0 = svdiv_n_s64_m (p0, z1, 3),
> +		z0 = svdiv_m (p0, z1, 3))
>
>  /*
>  ** div_s64_z_tied1:
> @@ -137,19 +137,19 @@ TEST_UNIFORM_ZX (div_x0_s64_z_untied, svint64_t, int64_t,
>  		z0 = svdiv_z (p0, z1, x0))
>
>  /*
> -** div_2_s64_z_tied1:
> -**	mov	(z[0-9]+\.d), #2
> +** div_3_s64_z_tied1:
> +**	mov	(z[0-9]+\.d), #3
>  **	movprfx	z0\.d, p0/z, z0\.d
>  **	sdiv	z0\.d, p0/m, z0\.d, \1
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s64_z_tied1, svint64_t,
> -		z0 = svdiv_n_s64_z (p0, z0, 2),
> -		z0 = svdiv_z (p0, z0, 2))
> +TEST_UNIFORM_Z (div_3_s64_z_tied1, svint64_t,
> +		z0 = svdiv_n_s64_z (p0, z0, 3),
> +		z0 = svdiv_z (p0, z0, 3))
>
>  /*
> -** div_2_s64_z_untied:
> -**	mov	(z[0-9]+\.d), #2
> +** div_3_s64_z_untied:
> +**	mov	(z[0-9]+\.d), #3
>  ** (
>  **	movprfx	z0\.d, p0/z, z1\.d
>  **	sdiv	z0\.d, p0/m, z0\.d, \1
> @@ -159,9 +159,9 @@ TEST_UNIFORM_Z (div_2_s64_z_tied1, svint64_t,
>  ** )
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s64_z_untied, svint64_t,
> -		z0 = svdiv_n_s64_z (p0, z1, 2),
> -		z0 = svdiv_z (p0, z1, 2))
> +TEST_UNIFORM_Z (div_3_s64_z_untied, svint64_t,
> +		z0 = svdiv_n_s64_z (p0, z1, 3),
> +		z0 = svdiv_z (p0, z1, 3))
>
>  /*
>  ** div_s64_x_tied1:
> @@ -217,21 +217,21 @@ TEST_UNIFORM_ZX (div_x0_s64_x_untied, svint64_t, int64_t,
>  		z0 = svdiv_x (p0, z1, x0))
>
>  /*
> -** div_2_s64_x_tied1:
> -**	mov	(z[0-9]+\.d), #2
> +** div_3_s64_x_tied1:
> +**	mov	(z[0-9]+\.d), #3
>  **	sdiv	z0\.d, p0/m, z0\.d, \1
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s64_x_tied1, svint64_t,
> -		z0 = svdiv_n_s64_x (p0, z0, 2),
> -		z0 = svdiv_x (p0, z0, 2))
> +TEST_UNIFORM_Z (div_3_s64_x_tied1, svint64_t,
> +		z0 = svdiv_n_s64_x (p0, z0, 3),
> +		z0 = svdiv_x (p0, z0, 3))
>
>  /*
> -** div_2_s64_x_untied:
> -**	mov	z0\.d, #2
> +** div_3_s64_x_untied:
> +**	mov	z0\.d, #3
>  **	sdivr	z0\.d, p0/m, z0\.d, z1\.d
>  **	ret
>  */
> -TEST_UNIFORM_Z (div_2_s64_x_untied, svint64_t,
> -		z0 = svdiv_n_s64_x (p0, z1, 2),
> -		z0 = svdiv_x (p0, z1, 2))
> +TEST_UNIFORM_Z (div_3_s64_x_untied, svint64_t,
> +		z0 = svdiv_n_s64_x (p0, z1, 3),
> +		z0 = svdiv_x (p0, z1, 3))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_1.c b/gcc/testsuite/gcc.target/aarch64/sve/div_const_1.c
> new file mode 100644
> index 00000000000..ac6ef1c73d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_1.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msve-vector-bits=128" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_sve.h>
> +
> +typedef svbool_t pred __attribute__((arm_sve_vector_bits(128)));
> +typedef svint64_t svint64_2 __attribute__((arm_sve_vector_bits(128)));
> +typedef svuint64_t svuint64_2 __attribute__((arm_sve_vector_bits(128)));
> +
> +/*
> +** f1:
> +**	ptrue	(p[0-7])\.b, vl16
> +**	asrd	(z[0-9]+\.d), \1/m, \2, #2
> +**	ret
> +*/
> +svint64_2 f1 (svint64_2 p)
> +{
> +  const pred pg = svptrue_b64 ();
> +  return svdiv_x (pg, p, (svint64_2) {4, 4});
> +}
> +
> +/*
> +** f2:
> +**	ptrue	(p[0-7])\.b, vl16
> +**	mov	(z[0-9]+\.d), #4
> +**	udiv	(z[0-9]+\.d), \1/m, \3, \2
> +**	ret
> +*/
> +svuint64_2 f2 (svuint64_2 p)
> +{
> +  const pred pg = svptrue_b64 ();
> +  return svdiv_x (pg, p, (svuint64_2) {4, 4});
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/div_const_1_run.c
> new file mode 100644
> index 00000000000..a15c597d5bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_1_run.c
> @@ -0,0 +1,91 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-O2 -msve-vector-bits=128" } */
> +
> +#include <arm_sve.h>
> +#include <stdint.h>
> +
> +typedef svbool_t pred __attribute__((arm_sve_vector_bits(128)));
> +typedef svfloat16_t svfloat16_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svfloat32_t svfloat32_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svfloat64_t svfloat64_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svint32_t svint32_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svint64_t svint64_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svuint32_t svuint32_ __attribute__((arm_sve_vector_bits(128)));
> +typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
> +
> +#define T1(TY, TYS, P)							\
> +{									\
> +  TY##_t a = (TY##_t) 79;						\
> +  TY##_t b = (TY##_t) 16;						\
> +  sv##TY##_ res = svdiv_##P (pg, svdup_##TYS (a), svdup_##TYS (b));	\
> +  sv##TY##_ exp = svdup_##TYS (a / b);					\
> +  if (svptest_any (pg, svcmpne (pg, exp, res)))				\
> +    __builtin_abort ();							\
> +}
> +
> +#define T2(B)								\
> +{									\
> +  int##B##_t a[] = {0, -1, 1, INT##B##_MAX, INT##B##_MIN, -5, 5};	\
> +  int##B##_t b[] = {-1, 1, -4, 4, -5, 5, INT##B##_MAX, INT##B##_MIN};	\
> +  int length_a = sizeof (a) / sizeof (a[0]);				\
> +  int length_b = sizeof (b) / sizeof (b[0]);				\
> +  for (int i = 0; i < length_a; ++i)					\
> +    {									\
> +      for (int j = 0; j < length_b; ++j)				\
> +	{								\
> +	  svint##B##_ op1 = svdup_s##B (a[i]);				\
> +	  svint##B##_ op2 = svdup_s##B (b[j]);				\
> +	  svint##B##_ res = svdiv_x (pg, op1, op2);			\
> +	  svint##B##_ exp = svdup_s##B (a[i] / b[j]);			\
> +	  if (svptest_any (pg, svcmpne (pg, exp, res)))			\
> +	    __builtin_abort ();						\
> +	}								\
> +    }									\
> +}
> +
> +#define TEST_VALUES_ASRD2						\
> +{									\
> +  svint32_ op1_32 = (svint32_) {0, 16, -79, -1};			\
> +  svint32_ op2_32 = (svint32_) {5, 8, -32, 1};				\
> +  svint32_ res_32 = svdiv_x (pg, op1_32, op2_32);			\
> +  svint32_ exp_32 = (svint32_) {0 / 5, 16 / 8, -79 / -32, -1 / 1};	\
> +  if (svptest_any (pg, svcmpne (pg, exp_32, res_32)))			\
> +    __builtin_abort ();							\
> +									\
> +  svint64_ op1_64 = (svint64_) {83, -11};				\
> +  svint64_ op2_64 = (svint64_) {16, 5};					\
> +  svint64_ res_64 = svdiv_x (pg, op1_64, op2_64);			\
> +  svint64_ exp_64 = (svint64_) {83 / 16, -11 / 5};			\
> +  if (svptest_any (pg, svcmpne (pg, exp_64, res_64)))			\
> +    __builtin_abort ();							\
> +}
> +
> +#define TEST_TYPES(T)							\
> +  T (float16, f16, x)							\
> +  T (float32, f32, x)							\
> +  T (float64, f64, x)							\
> +  T (int32, s32, x)							\
> +  T (int64, s64, x)							\
> +  T (uint32, u32, x)							\
> +  T (uint64, u64, x)							\
> +
> +#define TEST_PREDICATION(T)						\
> +  T (int32, s32, z)							\
> +  T (int32, s32, m)							\
> +  T (int64, s64, z)							\
> +  T (int64, s64, m)							\
> +
> +#define TEST_VALUES_ASRD1(T)						\
> +  T (32)								\
> +  T (64)								\
> +
> +int
> +main (void)
> +{
> +  const pred pg = svptrue_b64 ();
> +  TEST_TYPES (T1)
> +  TEST_PREDICATION (T1)
> +  TEST_VALUES_ASRD1 (T2)
> +  TEST_VALUES_ASRD2
> +  return 0;
> +}