Jennifer Schmitz <jschm...@nvidia.com> writes:
> This patch folds svdiv where one of the operands is all-zeros to a zero
> vector, if the predicate is ptrue or the predication is _x or _z.
> This case was not covered by the recent patch that implemented constant
> folding, because that covered only cases where both operands are
> constant vectors.  Here, the operation is folded as soon as one of the
> operands is a constant zero vector.
> Folding of division by 0 to return 0 is in accordance with
> the semantics of sdiv and udiv.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
> 	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
> 	Add folding of all-zero operands to zero vector.
>
> gcc/testsuite/
> 	* gcc.target/aarch64/sve/fold_div_zero.c: New test.
> 	* gcc.target/aarch64/sve/const_fold_div_1.c: Adjust expected
> 	outcome.
>
> From 1d50cc57cd3bbe19a48b7bbb543ea331cbd9a6f6 Mon Sep 17 00:00:00 2001
> From: Jennifer Schmitz <jschm...@nvidia.com>
> Date: Mon, 2 Sep 2024 06:46:57 -0700
> Subject: [PATCH] SVE intrinsics: Fold svdiv with all-zero operands to zero
>  vector
>
> This patch folds svdiv where one of the operands is all-zeros to a zero
> vector, if the predicate is ptrue or the predication is _x or _z.
> This case was not covered by the recent patch that implemented constant
> folding, because that covered only cases where both operands are
> constant vectors.  Here, the operation is folded as soon as one of the
> operands is a constant zero vector.
> Folding of division by 0 to return 0 is in accordance with
> the semantics of sdiv and udiv.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
> 	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
> 	Add folding of all-zero operands to zero vector.
>
> gcc/testsuite/
> 	* gcc.target/aarch64/sve/fold_div_zero.c: New test.
> 	* gcc.target/aarch64/sve/const_fold_div_1.c: Adjust expected
> 	outcome.
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc      |  38 +-
>  .../gcc.target/aarch64/sve/const_fold_div_1.c |  12 +-
>  .../gcc.target/aarch64/sve/fold_div_zero.c    | 369 ++++++++++++++++++
>  3 files changed, 402 insertions(+), 17 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 6c94d144dc9..3ec9ebbf6ef 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -758,30 +758,50 @@ public:
>      if (auto *res = f.fold_const_binary (TRUNC_DIV_EXPR))
>        return res;
>
> -    /* If the divisor is a uniform power of 2, fold to a shift
> -       instruction.  */
> +    tree pg = gimple_call_arg (f.call, 0);
> +    tree op1 = gimple_call_arg (f.call, 1);
>      tree op2 = gimple_call_arg (f.call, 2);
> -    tree divisor_cst = uniform_integer_cst_p (op2);
> +    bool pred_fold = f.pred != PRED_m
> +      || is_ptrue (pg, f.type_suffix (0).element_bytes);
>
> -    if (!divisor_cst || !integer_pow2p (divisor_cst))
> +    /* If the dividend is all zeros, fold to zero vector.  */
> +    tree op1_cst = uniform_integer_cst_p (op1);
> +    if (op1_cst && pred_fold && integer_zerop (op1_cst))
> +      return gimple_build_assign (f.lhs, op1);

This fold is ok for all predication types, since _m merges with the
first input.  There's also no need to apply uniform_integer_cst_p
manually, since integer_zerop handles vectors too.  So I think this
can be:

  /* If the dividend is all zeros, fold to zero vector.  */
  if (integer_zerop (op1))
    return gimple_build_assign (f.lhs, op1);

(The new _m cases would need tests though!)
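(With that change, the existing s64_m_pg_op1/u64_m_pg_op1 tests in the
new file would presumably also fold to a plain zero move, so their
expected asm would need updating along these lines.  This is an untested
sketch; the expected asm is only a guess at what the fold would emit:

  /*
  ** s64_m_pg_op1:
  **	mov	z[0-9]+\.b, #0
  **	ret
  */
  svint64_t s64_m_pg_op1 (svbool_t pg, svint64_t op2)
  {
    return svdiv_m (pg, svdup_s64 (0), op2);
  }
)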
> +
> +    /* If the divisor is all zeros, fold to zero vector.  */
> +    tree op2_cst = uniform_integer_cst_p (op2);
> +    if (!op2_cst)
>        return NULL;
>
> +    if (pred_fold && integer_zerop (op2_cst))
> +      {
> +	gimple_seq stmts = NULL;
> +	tree op2_vec = f.force_vector (stmts, TREE_TYPE (op1), op2);
> +	gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
> +	return gimple_build_assign (f.lhs, op2_vec);
> +      }

This would be simpler as:

  if (integer_zerop (op2_cst)
      && (f.pred != PRED_m
	  || is_ptrue (pg, f.type_suffix (0).element_bytes)))
    return gimple_build_assign (f.lhs, build_zero_cst (TREE_TYPE (f.lhs)));

(I've dropped the pred_fold variable, since it is only valid for things
that fold to zero.  For everything else we'd need == PRED_x instead.)
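Putting the two suggestions together, the zero-folding part of
svdiv_impl::fold would then reduce to something like the following
(an untested sketch only, reusing the names from the posted patch):

    tree pg = gimple_call_arg (f.call, 0);
    tree op1 = gimple_call_arg (f.call, 1);
    tree op2 = gimple_call_arg (f.call, 2);

    /* If the dividend is all zeros, fold to zero vector.  */
    if (integer_zerop (op1))
      return gimple_build_assign (f.lhs, op1);

    /* If the divisor is all zeros, fold to zero vector.  */
    tree op2_cst = uniform_integer_cst_p (op2);
    if (!op2_cst)
      return NULL;

    if (integer_zerop (op2_cst)
	&& (f.pred != PRED_m
	    || is_ptrue (pg, f.type_suffix (0).element_bytes)))
      return gimple_build_assign (f.lhs, build_zero_cst (TREE_TYPE (f.lhs)));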
> +
> +    /* If the divisor is a uniform power of 2, fold to a shift
> +       instruction.  */
> +    if (!integer_pow2p (op2_cst))
> +      return NULL;
>      tree new_divisor;
>      gcall *call;

Very minor nit, but: given the line spacing in the function, I think
it'd look better to have a blank line after the return.

Thanks,
Richard

> -    if (f.type_suffix (0).unsigned_p && tree_to_uhwi (divisor_cst) != 1)
> +    if (f.type_suffix (0).unsigned_p && tree_to_uhwi (op2_cst) != 1)
>        {
> 	 function_instance instance ("svlsr", functions::svlsr,
> 				     shapes::binary_uint_opt_n, MODE_n,
> 				     f.type_suffix_ids, GROUP_none, f.pred);
> 	 call = f.redirect_call (instance);
> -	 tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
> +	 tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : op2_cst;
> 	 new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
>        }
>      else
>        {
> -	 if (tree_int_cst_sign_bit (divisor_cst)
> -	     || tree_to_shwi (divisor_cst) == 1)
> +	 if (tree_int_cst_sign_bit (op2_cst)
> +	     || tree_to_shwi (op2_cst) == 1)
> 	   return NULL;
>
>  	 function_instance instance ("svasrd", functions::svasrd,
> @@ -789,7 +809,7 @@ public:
>  	 f.type_suffix_ids, GROUP_none, f.pred);
>  	 call = f.redirect_call (instance);
>  	 new_divisor = wide_int_to_tree (scalar_types[VECTOR_TYPE_svuint64_t],
> -					 tree_log2 (divisor_cst));
> +					 tree_log2 (op2_cst));
>        }
>
>      gimple_call_set_arg (call, 2, new_divisor);
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
> index c15b3fc3aa0..92e0005c0fe 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
> @@ -45,7 +45,7 @@ svint64_t s64_z_pg (svbool_t pg)
>
>  /*
>  ** s64_z_pg_0:
> -**	mov	z[0-9]+\.d, p[0-7]/z, #0
> +**	mov	z[0-9]+\.b, #0
>  **	ret
>  */
>  svint64_t s64_z_pg_0 (svbool_t pg)
> @@ -55,9 +55,7 @@ svint64_t s64_z_pg_0 (svbool_t pg)
>
>  /*
>  ** s64_z_pg_by0:
> -**	mov	(z[0-9]+\.d), #5
> -**	mov	(z[0-9]+)\.b, #0
> -**	sdivr	\2\.d, p[0-7]/m, \2\.d, \1
> +**	mov	z[0-9]+\.b, #0
>  **	ret
>  */
>  svint64_t s64_z_pg_by0 (svbool_t pg)
> @@ -149,7 +147,7 @@ svint64_t s64_z_pg_n (svbool_t pg)
>
>  /*
>  ** s64_z_pg_n_s64_0:
> -**	mov	z[0-9]+\.d, p[0-7]/z, #0
> +**	mov	z[0-9]+\.b, #0
>  **	ret
>  */
>  svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
> @@ -159,9 +157,7 @@ svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
>
>  /*
>  ** s64_z_pg_n_s64_by0:
> -**	mov	(z[0-9]+\.d), #5
> -**	mov	(z[0-9]+)\.b, #0
> -**	sdivr	\2\.d, p[0-7]/m, \2\.d, \1
> +**	mov	z[0-9]+\.b, #0
>  **	ret
>  */
>  svint64_t s64_z_pg_n_s64_by0 (svbool_t pg)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c
> new file mode 100644
> index 00000000000..be4a7353da0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c
> @@ -0,0 +1,369 @@
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* { dg-options "-O2" } */
> +
> +#include "arm_sve.h"
> +
> +/*
> +** s64_x_pg_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_x_pg_op1 (svbool_t pg, svint64_t op2)
> +{
> +  return svdiv_x (pg, svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_z_pg_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_z_pg_op1 (svbool_t pg, svint64_t op2)
> +{
> +  return svdiv_z (pg, svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_m_pg_op1:
> +**	mov	z[0-9]+\.d, p[0-7]/z, #0
> +**	ret
> +*/
> +svint64_t s64_m_pg_op1 (svbool_t pg, svint64_t op2)
> +{
> +  return svdiv_m (pg, svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_x_ptrue_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_x_ptrue_op1 (svint64_t op2)
> +{
> +  return svdiv_x (svptrue_b64 (), svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_z_ptrue_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_z_ptrue_op1 (svint64_t op2)
> +{
> +  return svdiv_z (svptrue_b64 (), svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_m_ptrue_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_m_ptrue_op1 (svint64_t op2)
> +{
> +  return svdiv_m (svptrue_b64 (), svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_x_pg_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_x_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_x (pg, op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_z_pg_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_z_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_z (pg, op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_m_pg_op2:
> +**	mov	(z[0-9]+)\.b, #0
> +**	sdiv	(z[0-9]\.d), p[0-7]/m, \2, \1\.d
> +**	ret
> +*/
> +svint64_t s64_m_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_m (pg, op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_x_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_x_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_x (svptrue_b64 (), op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_z_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_z_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_z (svptrue_b64 (), op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_m_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_m_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_m (svptrue_b64 (), op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_n_x_pg_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_n_x_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_n_s64_x (pg, op1, 0);
> +}
> +
> +/*
> +** s64_n_z_pg_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_n_z_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_n_s64_z (pg, op1, 0);
> +}
> +
> +/*
> +** s64_n_m_pg_op2:
> +**	mov	(z[0-9]+)\.b, #0
> +**	sdiv	(z[0-9]+\.d), p[0-7]/m, \2, \1\.d
> +**	ret
> +*/
> +svint64_t s64_n_m_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_n_s64_m (pg, op1, 0);
> +}
> +
> +/*
> +** s64_n_x_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_n_x_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_n_s64_x (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** s64_n_z_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_n_z_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_n_s64_z (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** s64_n_m_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svint64_t s64_n_m_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_n_s64_m (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** u64_x_pg_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_x_pg_op1 (svbool_t pg, svuint64_t op2)
> +{
> +  return svdiv_x (pg, svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_z_pg_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_z_pg_op1 (svbool_t pg, svuint64_t op2)
> +{
> +  return svdiv_z (pg, svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_m_pg_op1:
> +**	mov	z[0-9]+\.d, p[0-7]/z, #0
> +**	ret
> +*/
> +svuint64_t u64_m_pg_op1 (svbool_t pg, svuint64_t op2)
> +{
> +  return svdiv_m (pg, svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_x_ptrue_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_x_ptrue_op1 (svuint64_t op2)
> +{
> +  return svdiv_x (svptrue_b64 (), svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_z_ptrue_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_z_ptrue_op1 (svuint64_t op2)
> +{
> +  return svdiv_z (svptrue_b64 (), svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_m_ptrue_op1:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_m_ptrue_op1 (svuint64_t op2)
> +{
> +  return svdiv_m (svptrue_b64 (), svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_x_pg_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_x_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_x (pg, op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_z_pg_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_z_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_z (pg, op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_m_pg_op2:
> +**	mov	(z[0-9]+)\.b, #0
> +**	udiv	(z[0-9]+\.d), p[0-7]/m, \2, \1\.d
> +**	ret
> +*/
> +svuint64_t u64_m_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_m (pg, op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_x_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_x_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_x (svptrue_b64 (), op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_z_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_z_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_z (svptrue_b64 (), op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_m_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_m_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_m (svptrue_b64 (), op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_n_x_pg_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_n_x_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_n_u64_x (pg, op1, 0);
> +}
> +
> +/*
> +** u64_n_z_pg_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_n_z_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_n_u64_z (pg, op1, 0);
> +}
> +
> +/*
> +** u64_n_m_pg_op2:
> +**	mov	(z[0-9]+)\.b, #0
> +**	udiv	(z[0-9]+\.d), p[0-7]/m, \2, \1\.d
> +**	ret
> +*/
> +svuint64_t u64_n_m_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_n_u64_m (pg, op1, 0);
> +}
> +
> +/*
> +** u64_n_x_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_n_x_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_n_u64_x (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** u64_n_z_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_n_z_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_n_u64_z (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** u64_n_m_ptrue_op2:
> +**	mov	z[0-9]+\.b, #0
> +**	ret
> +*/
> +svuint64_t u64_n_m_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_n_u64_m (svptrue_b64 (), op1, 0);
> +}
> +