> On 5 Aug 2024, at 12:01, Richard Sandiford <richard.sandif...@arm.com> wrote:
>
> Jennifer Schmitz <jschm...@nvidia.com> writes:
>> This patch folds the SVE intrinsic svdiv into a vector of 1's in case
>> 1) the predicate is svptrue and
>> 2) dividend and divisor are equal.
>> This is implemented in the gimple_folder for signed and unsigned
>> integers.  Corresponding test cases were added to the existing test
>> suites.
>>
>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
>> OK for mainline?
>>
>> Please also advise whether it makes sense to implement the same optimization
>> for float types and if so, under which conditions?
>
> I think we should instead use const_binop to try to fold the division
> whenever the predicate is all-true, or if the function uses _x predication.
> (As a follow-on, we could handle _z and _m too, using VEC_COND_EXPR.)
>
From what I can see, const_binop only works on constant arguments.
Is fold_binary a better interface to use?  I think it'd hook into the
match.pd machinery for divisions at some point.
Thanks,
Kyrill

> We shouldn't need to vet the arguments, since const_binop does that itself.
> Using const_binop should also get the conditions right for floating-point
> divisions.
>
> Thanks,
> Richard
>
>
>>
>> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>>
>> gcc/
>>
>>	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
>>	Add optimization.
>>
>> gcc/testsuite/
>>
>>	* gcc.target/aarch64/sve/acle/asm/div_s32.c: New test.
>>	* gcc.target/aarch64/sve/acle/asm/div_s64.c: Likewise.
>>	* gcc.target/aarch64/sve/acle/asm/div_u32.c: Likewise.
>>	* gcc.target/aarch64/sve/acle/asm/div_u64.c: Likewise.
>>
>> From 43913cfa47b31d055a0456c863a30e3e44acc2f0 Mon Sep 17 00:00:00 2001
>> From: Jennifer Schmitz <jschm...@nvidia.com>
>> Date: Fri, 2 Aug 2024 06:41:09 -0700
>> Subject: [PATCH] SVE intrinsics: Fold svdiv (svptrue, x, x) to ones
>>
>> This patch folds the SVE intrinsic svdiv into a vector of 1's in case
>> 1) the predicate is svptrue and
>> 2) dividend and divisor are equal.
>> This is implemented in the gimple_folder for signed and unsigned
>> integers.  Corresponding test cases were added to the existing test
>> suites.
>>
>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
>> OK for mainline?
>>
>> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>>
>> gcc/
>>
>>	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
>>	Add optimization.
>>
>> gcc/testsuite/
>>
>>	* gcc.target/aarch64/sve/acle/asm/div_s32.c: New test.
>>	* gcc.target/aarch64/sve/acle/asm/div_s64.c: Likewise.
>>	* gcc.target/aarch64/sve/acle/asm/div_u32.c: Likewise.
>>	* gcc.target/aarch64/sve/acle/asm/div_u64.c: Likewise.
>> ---
>>  .../aarch64/aarch64-sve-builtins-base.cc      | 19 ++++++++++---
>>  .../gcc.target/aarch64/sve/acle/asm/div_s32.c | 27 +++++++++++++++++++
>>  .../gcc.target/aarch64/sve/acle/asm/div_s64.c | 27 +++++++++++++++++++
>>  .../gcc.target/aarch64/sve/acle/asm/div_u32.c | 27 +++++++++++++++++++
>>  .../gcc.target/aarch64/sve/acle/asm/div_u64.c | 27 +++++++++++++++++++
>>  5 files changed, 124 insertions(+), 3 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> index d55bee0b72f..e347d29c725 100644
>> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> @@ -755,8 +755,21 @@ public:
>>    gimple *
>>    fold (gimple_folder &f) const override
>>    {
>> -    tree divisor = gimple_call_arg (f.call, 2);
>> -    tree divisor_cst = uniform_integer_cst_p (divisor);
>> +    tree pg = gimple_call_arg (f.call, 0);
>> +    tree op1 = gimple_call_arg (f.call, 1);
>> +    tree op2 = gimple_call_arg (f.call, 2);
>> +
>> +    if (f.type_suffix (0).integer_p
>> +	&& is_ptrue (pg, f.type_suffix (0).element_bytes)
>> +	&& operand_equal_p (op1, op2, 0))
>> +      {
>> +	tree lhs_type = TREE_TYPE (f.lhs);
>> +	tree_vector_builder builder (lhs_type, 1, 1);
>> +	builder.quick_push (build_each_one_cst (TREE_TYPE (lhs_type)));
>> +	return gimple_build_assign (f.lhs, builder.build ());
>> +      }
>> +
>> +    tree divisor_cst = uniform_integer_cst_p (op2);
>>
>>      if (!divisor_cst || !integer_pow2p (divisor_cst))
>>        return NULL;
>> @@ -770,7 +783,7 @@ public:
>>				     shapes::binary_uint_opt_n, MODE_n,
>>				     f.type_suffix_ids, GROUP_none, f.pred);
>>	call = f.redirect_call (instance);
>> -	tree d = INTEGRAL_TYPE_P (TREE_TYPE (divisor)) ? divisor : divisor_cst;
>> +	tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
>>	new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
>>        }
>>      else
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>> index d5a23bf0726..09d0419f3ef 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
>> @@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t,
>>  		z0 = svdiv_n_s32_m (p0, z1, x0),
>>  		z0 = svdiv_m (p0, z1, x0))
>>
>> +/*
>> +** div_same_s32_m_tied1:
>> +**	mov	z0\.s, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_s32_m_tied1, svint32_t,
>> +		z0 = svdiv_s32_m (svptrue_b32 (), z0, z0),
>> +		z0 = svdiv_m (svptrue_b32 (), z0, z0))
>> +
>>  /*
>>  ** div_1_s32_m_tied1:
>>  **	sel	z0\.s, p0, z0\.s, z0\.s
>> @@ -215,6 +224,15 @@ TEST_UNIFORM_ZX (div_w0_s32_z_untied, svint32_t, int32_t,
>>  		z0 = svdiv_n_s32_z (p0, z1, x0),
>>  		z0 = svdiv_z (p0, z1, x0))
>>
>> +/*
>> +** div_same_s32_z_tied1:
>> +**	mov	z0\.s, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_s32_z_tied1, svint32_t,
>> +		z0 = svdiv_s32_z (svptrue_b32 (), z0, z0),
>> +		z0 = svdiv_z (svptrue_b32 (), z0, z0))
>> +
>>  /*
>>  ** div_1_s32_z_tied1:
>>  **	mov	(z[0-9]+\.s), #1
>> @@ -384,6 +402,15 @@ TEST_UNIFORM_ZX (div_w0_s32_x_untied, svint32_t, int32_t,
>>  		z0 = svdiv_n_s32_x (p0, z1, x0),
>>  		z0 = svdiv_x (p0, z1, x0))
>>
>> +/*
>> +** div_same_s32_x_tied1:
>> +**	mov	z0\.s, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_s32_x_tied1, svint32_t,
>> +		z0 = svdiv_s32_x (svptrue_b32 (), z0, z0),
>> +		z0 = svdiv_x (svptrue_b32 (), z0, z0))
>> +
>>  /*
>>  ** div_1_s32_x_tied1:
>>  **	ret
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
>> index cfed6f9c1b3..a9b3e1b0b89 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c
>> @@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_x0_s64_m_untied, svint64_t, int64_t,
>>  		z0 = svdiv_n_s64_m (p0, z1, x0),
>>  		z0 = svdiv_m (p0, z1, x0))
>>
>> +/*
>> +** div_same_s64_m_tied1:
>> +**	mov	z0\.d, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_s64_m_tied1, svint64_t,
>> +		z0 = svdiv_s64_m (svptrue_b64 (), z0, z0),
>> +		z0 = svdiv_m (svptrue_b64 (), z0, z0))
>> +
>>  /*
>>  ** div_1_s64_m_tied1:
>>  **	sel	z0\.d, p0, z0\.d, z0\.d
>> @@ -215,6 +224,15 @@ TEST_UNIFORM_ZX (div_x0_s64_z_untied, svint64_t, int64_t,
>>  		z0 = svdiv_n_s64_z (p0, z1, x0),
>>  		z0 = svdiv_z (p0, z1, x0))
>>
>> +/*
>> +** div_same_s64_z_tied1:
>> +**	mov	z0\.d, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_s64_z_tied1, svint64_t,
>> +		z0 = svdiv_s64_z (svptrue_b64 (), z0, z0),
>> +		z0 = svdiv_z (svptrue_b64 (), z0, z0))
>> +
>>  /*
>>  ** div_1_s64_z_tied1:
>>  **	mov	(z[0-9]+\.d), #1
>> @@ -384,6 +402,15 @@ TEST_UNIFORM_ZX (div_x0_s64_x_untied, svint64_t, int64_t,
>>  		z0 = svdiv_n_s64_x (p0, z1, x0),
>>  		z0 = svdiv_x (p0, z1, x0))
>>
>> +/*
>> +** div_same_s64_x_tied1:
>> +**	mov	z0\.d, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_s64_x_tied1, svint64_t,
>> +		z0 = svdiv_s64_x (svptrue_b64 (), z0, z0),
>> +		z0 = svdiv_x (svptrue_b64 (), z0, z0))
>> +
>>  /*
>>  ** div_1_s64_x_tied1:
>>  **	ret
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c
>> index 9707664caf4..edd3d03dfce 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c
>> @@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_w0_u32_m_untied, svuint32_t, uint32_t,
>>  		z0 = svdiv_n_u32_m (p0, z1, x0),
>>  		z0 = svdiv_m (p0, z1, x0))
>>
>> +/*
>> +** div_same_u32_m_tied1:
>> +**	mov	z0\.s, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_u32_m_tied1, svuint32_t,
>> +		z0 = svdiv_u32_m (svptrue_b32 (), z0, z0),
>> +		z0 = svdiv_m (svptrue_b32 (), z0, z0))
>> +
>>  /*
>>  ** div_1_u32_m_tied1:
>>  **	sel	z0\.s, p0, z0\.s, z0\.s
>> @@ -194,6 +203,15 @@ TEST_UNIFORM_ZX (div_w0_u32_z_untied, svuint32_t, uint32_t,
>>  		z0 = svdiv_n_u32_z (p0, z1, x0),
>>  		z0 = svdiv_z (p0, z1, x0))
>>
>> +/*
>> +** div_same_u32_z_tied1:
>> +**	mov	z0\.s, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_u32_z_tied1, svuint32_t,
>> +		z0 = svdiv_u32_z (svptrue_b32 (), z0, z0),
>> +		z0 = svdiv_z (svptrue_b32 (), z0, z0))
>> +
>>  /*
>>  ** div_1_u32_z_tied1:
>>  **	mov	(z[0-9]+\.s), #1
>> @@ -336,6 +354,15 @@ TEST_UNIFORM_ZX (div_w0_u32_x_untied, svuint32_t, uint32_t,
>>  		z0 = svdiv_n_u32_x (p0, z1, x0),
>>  		z0 = svdiv_x (p0, z1, x0))
>>
>> +/*
>> +** div_same_u32_x_tied1:
>> +**	mov	z0\.s, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_u32_x_tied1, svuint32_t,
>> +		z0 = svdiv_u32_x (svptrue_b32 (), z0, z0),
>> +		z0 = svdiv_x (svptrue_b32 (), z0, z0))
>> +
>>  /*
>>  ** div_1_u32_x_tied1:
>>  **	ret
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c
>> index 5247ebdac7a..3f6e581f122 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c
>> @@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_x0_u64_m_untied, svuint64_t, uint64_t,
>>  		z0 = svdiv_n_u64_m (p0, z1, x0),
>>  		z0 = svdiv_m (p0, z1, x0))
>>
>> +/*
>> +** div_same_u64_m_tied1:
>> +**	mov	z0\.d, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_u64_m_tied1, svuint64_t,
>> +		z0 = svdiv_u64_m (svptrue_b64 (), z0, z0),
>> +		z0 = svdiv_m (svptrue_b64 (), z0, z0))
>> +
>>  /*
>>  ** div_1_u64_m_tied1:
>>  **	sel	z0\.d, p0, z0\.d, z0\.d
>> @@ -194,6 +203,15 @@ TEST_UNIFORM_ZX (div_x0_u64_z_untied, svuint64_t, uint64_t,
>>  		z0 = svdiv_n_u64_z (p0, z1, x0),
>>  		z0 = svdiv_z (p0, z1, x0))
>>
>> +/*
>> +** div_same_u64_z_tied1:
>> +**	mov	z0\.d, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_u64_z_tied1, svuint64_t,
>> +		z0 = svdiv_u64_z (svptrue_b64 (), z0, z0),
>> +		z0 = svdiv_z (svptrue_b64 (), z0, z0))
>> +
>>  /*
>>  ** div_1_u64_z_tied1:
>>  **	mov	(z[0-9]+\.d), #1
>> @@ -336,6 +354,15 @@ TEST_UNIFORM_ZX (div_x0_u64_x_untied, svuint64_t, uint64_t,
>>  		z0 = svdiv_n_u64_x (p0, z1, x0),
>>  		z0 = svdiv_x (p0, z1, x0))
>>
>> +/*
>> +** div_same_u64_x_tied1:
>> +**	mov	z0\.d, #1
>> +**	ret
>> +*/
>> +TEST_UNIFORM_Z (div_same_u64_x_tied1, svuint64_t,
>> +		z0 = svdiv_u64_x (svptrue_b64 (), z0, z0),
>> +		z0 = svdiv_x (svptrue_b64 (), z0, z0))
>> +
>>  /*
>>  ** div_1_u64_x_tied1:
>>  **	ret
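
P.S. To make the interface question above concrete, here is a rough,
untested sketch of the const_binop direction, reusing the names from the
folder in the patch.  The placement, the PRED_x check and the choice
between TRUNC_DIV_EXPR and RDIV_EXPR based on the type suffix are my
assumptions, not something the patch already does:

  gimple *
  fold (gimple_folder &f) const override
  {
    tree pg = gimple_call_arg (f.call, 0);
    tree op1 = gimple_call_arg (f.call, 1);
    tree op2 = gimple_call_arg (f.call, 2);

    /* An all-true predicate (or _x predication, where inactive lanes
       are undefined) makes the call equivalent to an unpredicated
       division, so try the generic constant folder.  const_binop
       returns NULL_TREE for operands it cannot fold, so the arguments
       should not need vetting here.  */
    if (f.pred == PRED_x
	|| is_ptrue (pg, f.type_suffix (0).element_bytes))
      {
	tree_code code
	  = f.type_suffix (0).integer_p ? TRUNC_DIV_EXPR : RDIV_EXPR;
	if (tree res = const_binop (code, TREE_TYPE (f.lhs), op1, op2))
	  return gimple_build_assign (f.lhs, res);
      }

    /* ...fall through to the existing power-of-two handling...  */
  }

Swapping const_binop for fold_binary at the same spot would also catch
non-constant identities like x / x through match.pd, but the result
would then not necessarily be a valid gimple operand and might need
forcing into a fresh SSA name before the assignment.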