Jennifer Schmitz <jschm...@nvidia.com> writes: > This patch implements constant folding for svdiv. A new gimple_folder > method was added that uses const_binop to fold binary operations using a > given tree_code. For svdiv, this method is used to fold constant > operands. > Additionally, if at least one of the operands is a zero vector, svdiv is > folded to a zero vector (in case of ptrue, _x, or _z). > Tests were added to check the produced assembly for different > predicates and signed and unsigned integers. > Currently, constant folding is only implemented for integers and binary > operations, but extending it to float types and other operations is > planned for a future follow-up. > > The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression. > OK for mainline? > > Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com> > > gcc/ > > * config/aarch64/aarch64-sve-builtins-base.cc > (svdiv_impl::fold): Add constant folding. > * config/aarch64/aarch64-sve-builtins.cc > (gimple_folder::const_fold): New method. > * config/aarch64/aarch64-sve-builtins.h > (gimple_folder::const_fold): Add function declaration. > > gcc/testsuite/ > > * gcc.target/aarch64/sve/const_fold_div_1.c: New test. > * gcc.target/aarch64/sve/const_fold_div_zero.c: Likewise. > > From 79355d876503558f661b46ebbeaa11c74ce176cb Mon Sep 17 00:00:00 2001 > From: Jennifer Schmitz <jschm...@nvidia.com> > Date: Thu, 15 Aug 2024 05:42:06 -0700 > Subject: [PATCH 1/2] SVE intrinsics: Fold constant operands for svdiv > > This patch implements constant folding for svdiv. A new gimple_folder > method was added that uses const_binop to fold binary operations using a > given tree_code. For svdiv, this method is used to fold constant > operands. > Additionally, if at least one of the operands is a zero vector, svdiv is > folded to a zero vector (in case of ptrue, _x, or _z). > Tests were added to check the produced assembly for different > predicates and signed and unsigned integers. > Currently, constant folding is only implemented for integers and binary > operations, but extending it to float types and other operations is > planned for a future follow-up. > > The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression. > OK for mainline? > > Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com> > > gcc/ > > * config/aarch64/aarch64-sve-builtins-base.cc > (svdiv_impl::fold): Add constant folding. > * config/aarch64/aarch64-sve-builtins.cc > (gimple_folder::const_fold): New method. > * config/aarch64/aarch64-sve-builtins.h > (gimple_folder::const_fold): Add function declaration. > > gcc/testsuite/ > > * gcc.target/aarch64/sve/const_fold_div_1.c: New test. > * gcc.target/aarch64/sve/const_fold_div_zero.c: Likewise. 
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc | 30 ++-
>  gcc/config/aarch64/aarch64-sve-builtins.cc | 25 +++
>  gcc/config/aarch64/aarch64-sve-builtins.h | 1 +
>  .../gcc.target/aarch64/sve/const_fold_div_1.c | 128 ++++++++++++
>  .../aarch64/sve/const_fold_div_zero.c | 186 ++++++++++++++++++
>  .../aarch64/sve/const_fold_mul_zero.c | 95 +++++++++
>  6 files changed, 462 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_zero.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_zero.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index d55bee0b72f..7f948ecc0c7 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -755,8 +755,32 @@ public:
>    gimple *
>    fold (gimple_folder &f) const override
>    {
> -    tree divisor = gimple_call_arg (f.call, 2);
> -    tree divisor_cst = uniform_integer_cst_p (divisor);
> +    tree pg = gimple_call_arg (f.call, 0);
> +    tree op1 = gimple_call_arg (f.call, 1);
> +    tree op2 = gimple_call_arg (f.call, 2);
> +
> +    /* For integer division, if the dividend or divisor are all zeros,
> +       fold to zero vector.  */
> +    int step = f.type_suffix (0).element_bytes;
> +    if (f.pred != PRED_m || is_ptrue (pg, step))
> +      {
> +        if (vector_cst_all_same (op1, step)
> +            && integer_zerop (VECTOR_CST_ENCODED_ELT (op1, 0)))
> +          return gimple_build_assign (f.lhs, op1);
> +        if (vector_cst_all_same (op2, step)
> +            && integer_zerop (VECTOR_CST_ENCODED_ELT (op2, 0)))
> +          return gimple_build_assign (f.lhs, op2);
> +      }
Rather than handle all-zeros as a special case here, I think we should
try to do it elementwise in the const_binop.  More below.

> +
> +    /* Try to fold constant operands.  */
> +    tree_code m_code = f.type_suffix (0).integer_p ? TRUNC_DIV_EXPR
> +                                                   : RDIV_EXPR;
> +    if (gimple *new_stmt = f.const_fold (m_code))
> +      return new_stmt;
> +
> +    /* If the divisor is a uniform power of 2, fold to a shift
> +       instruction.  */
> +    tree divisor_cst = uniform_integer_cst_p (op2);
>
>     if (!divisor_cst || !integer_pow2p (divisor_cst))
>       return NULL;
> @@ -770,7 +794,7 @@ public:
>                                shapes::binary_uint_opt_n, MODE_n,
>                                f.type_suffix_ids, GROUP_none, f.pred);
>         call = f.redirect_call (instance);
> -       tree d = INTEGRAL_TYPE_P (TREE_TYPE (divisor)) ? divisor : divisor_cst;
> +       tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
>         new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
>       }
>     else
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc
> b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index 0a560eaedca..0f69c586464 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3691,6 +3691,31 @@ gimple_folder::fold_to_vl_pred (unsigned int vl)
>    return gimple_build_assign (lhs, builder.build ());
>  }
>
> +/* If the predicate is svptrue or PRED_x, try to perform constant folding
> +   on the call using the given tree_code.
> +   Return the new statement on success, otherwise return null.  */
> +gimple *
> +gimple_folder::const_fold (tree_code code)
> +{
> +  tree pg = gimple_call_arg (call, 0);
> +  if (type_suffix (0).integer_p
> +      && (is_ptrue (pg, type_suffix (0).element_bytes)
> +          || pred == PRED_x))
> +    {
> +      if (TREE_CODE_CLASS (code) == tcc_binary)
> +        {
> +          gcc_assert (gimple_call_num_args (call) == 3);
> +          tree op1 = gimple_call_arg (call, 1);
> +          tree op2 = gimple_call_arg (call, 2);
> +          if (TREE_TYPE (op1) != TREE_TYPE (op2))
> +            return NULL;

I assume this is rejecting the svdiv_n_* case, is that right?  I think
we should instead try to handle that too, since the _n variants are
specifically provided as a convenience for uniform divisors.

It looks like const_binop should just work for that case too, thanks to
the shift handling.  (AFAICT, the handling is not explicitly restricted
to shifts.)  But if it doesn't, I think it would be a reasonable
extension.

> +          if (tree res = const_binop (code, TREE_TYPE (lhs), op1, op2))
> +            return gimple_build_assign (lhs, res);

Going back to the comment above about handling /0 elementwise: how about
splitting the vector part of const_binop out into a new public function
with the following interface:

  tree vector_const_binop (tree_code code, tree arg1, tree arg2,
                           tree (*elt_const_binop) (tree_code, tree, tree))

where "the vector part" is everything in the function after:

  if (TREE_CODE (arg1) == VECTOR_CST
      && TREE_CODE (arg2) == VECTOR_CST
      ...

Then const_binop itself can just use:

  return vector_const_binop (code, arg1, arg2, const_binop);

whereas the aarch64 code can pass its own wrapper that handles the extra
defined cases.

+Richi in case he has any thoughts on this.

I think the starting point for the aarch64 implementation should be
something like:

  if (poly_int_tree_p (arg1) && poly_int_tree_p (arg2))
    {
      poly_wide_int poly_res;
      tree type = TREE_TYPE (arg1);
      signop sign = TYPE_SIGN (type);
      wi::overflow_type overflow = wi::OVF_NONE;

      ...if chain of special cases...

      else if (!poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow))
        return NULL_TREE;
      return force_fit_type (type, poly_res, false,
                             TREE_OVERFLOW (arg1) | TREE_OVERFLOW (arg2));
    }
  return NULL_TREE;

which is adapted from int_const_binop, and would need poly_int_binop to
become a public function.  The key thing here is that we completely
ignore overflow in the calculation, because the semantics of the
intrinsics are that language-level overflow does not happen.
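To make the "special cases" part more concrete, here is a rough,
untested sketch of what the aarch64 callback could look like.  The name
aarch64_const_binop is just a placeholder, and folding integer division
by zero to zero is an assumption on my part (it matches what SDIV and
UDIV return at runtime), so treat this as an illustration rather than a
finished implementation:

  /* Hypothetical element-wise folder for use with vector_const_binop.
     Unlike const_binop, it ignores overflow and folds x / 0 to 0,
     matching the architected behaviour of SDIV/UDIV.  */
  static tree
  aarch64_const_binop (enum tree_code code, tree arg1, tree arg2)
  {
    if (poly_int_tree_p (arg1) && poly_int_tree_p (arg2))
      {
        poly_wide_int poly_res;
        tree type = TREE_TYPE (arg1);
        signop sign = TYPE_SIGN (type);
        wi::overflow_type overflow = wi::OVF_NONE;

        /* The defined intrinsic behaviour for division by zero is to
           return zero, so fold it rather than punting.  */
        if (code == TRUNC_DIV_EXPR && integer_zerop (arg2))
          return arg2;
        if (!poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow))
          return NULL_TREE;
        return force_fit_type (type, poly_res, false,
                               TREE_OVERFLOW (arg1) | TREE_OVERFLOW (arg2));
      }
    return NULL_TREE;
  }

With something along those lines, the all-zeros handling in
svdiv_impl::fold above should fall out of the element-wise folding,
rather than needing a separate VECTOR_CST_ENCODED_ELT check.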
Thanks,
Richard

> +        }
> +    }
> +  return NULL;
> +}
> +
>  /* Try to fold the call.  Return the new statement on success and null
>     on failure.  */
>  gimple *
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h
> b/gcc/config/aarch64/aarch64-sve-builtins.h
> index 9ab6f202c30..db30225a008 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.h
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.h
> @@ -636,6 +636,7 @@ public:
>    gimple *fold_to_pfalse ();
>    gimple *fold_to_ptrue ();
>    gimple *fold_to_vl_pred (unsigned int);
> +  gimple *const_fold (tree_code);
>
>    gimple *fold ();
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c > new file mode 100644 > index 00000000000..d8460a4d336 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c > @@ -0,0 +1,128 @@ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* { dg-options "-O2" } */ > + > +#include "arm_sve.h" > + > +/* > +** s64_x_pg: > +** mov z[0-9]+\.d, #1 > +** ret > +*/ > +svint64_t s64_x_pg (svbool_t pg) > +{ > + return svdiv_x (pg, svdup_s64 (5), svdup_s64 (3)); > +} > + > +/* > +** s64_z_pg: > +** mov z[0-9]+\.d, p[0-7]/z, #1 > +** ret > +*/ > +svint64_t s64_z_pg (svbool_t pg) > +{ > + return svdiv_z (pg, svdup_s64 (5), svdup_s64 (3)); > +} > + > +/* > +** s64_m_pg: > +** mov (z[0-9]+\.d), #3 > +** mov (z[0-9]+\.d), #5 > +** sdiv \2, p[0-7]/m, \2, \1 > +** ret > +*/ > +svint64_t s64_m_pg (svbool_t pg) > +{ > + return svdiv_m (pg, svdup_s64 (5), svdup_s64 (3)); > +} > + > +/* > +** s64_x_ptrue: > +** mov z[0-9]+\.d, #1 > +** ret > +*/ > +svint64_t s64_x_ptrue () > +{ > + return svdiv_x (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3)); > +} > + > +/* > +** s64_z_ptrue: > +** mov z[0-9]+\.d, #1 > +** ret > +*/ > +svint64_t s64_z_ptrue () > +{ > + return svdiv_z (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3)); > +} > + > +/* > +** s64_m_ptrue: > +** mov z[0-9]+\.d, #1 > +** ret > +*/ > +svint64_t s64_m_ptrue () > +{ > + return svdiv_m (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3)); > +} > + > +/* > +** u64_x_pg: > +** mov z[0-9]+\.d, #1 > +** ret > +*/ > +svuint64_t u64_x_pg (svbool_t pg) > +{ > + return svdiv_x (pg, svdup_u64 (5), svdup_u64 (3)); > +} > + > +/* > +** u64_z_pg: > +** mov z[0-9]+\.d, p[0-7]/z, #1 > +** ret > +*/ > +svuint64_t u64_z_pg (svbool_t pg) > +{ > + return svdiv_z (pg, svdup_u64 (5), svdup_u64 (3)); > +} > + > +/* > +** u64_m_pg: > +** mov (z[0-9]+\.d), #3 > +** mov (z[0-9]+\.d), #5 > +** udiv \2, p[0-7]/m, \2, \1 > +** ret > +*/ > +svuint64_t u64_m_pg (svbool_t pg) > +{ > + return svdiv_m (pg, svdup_u64 (5), svdup_u64 (3)); > +} > + > +/* > +** u64_x_ptrue: > +** mov z[0-9]+\.d, #1 > +** ret > +*/ > +svuint64_t u64_x_ptrue () > +{ > + return svdiv_x (svptrue_b64 (), svdup_u64 (5), svdup_u64 (3)); > +} > + > +/* > +** u64_z_ptrue: > +** mov z[0-9]+\.d, #1 > +** ret > +*/ > +svuint64_t u64_z_ptrue () > +{ > + return svdiv_z (svptrue_b64 (), svdup_u64 (5), svdup_u64 (3)); > +} > + > +/* > +** u64_m_ptrue: > +** mov z[0-9]+\.d, #1 > +** ret > +*/ >
+svuint64_t u64_m_ptrue () > +{ > + return svdiv_m (svptrue_b64 (), svdup_u64 (5), svdup_u64 (3)); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_zero.c > b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_zero.c > new file mode 100644 > index 00000000000..00d14a46ced > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_zero.c > @@ -0,0 +1,186 @@ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* { dg-options "-O2" } */ > + > +#include "arm_sve.h" > + > +/* > +** s64_x_pg_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_x_pg_op1 (svbool_t pg, svint64_t op2) > +{ > + return svdiv_x (pg, svdup_s64 (0), op2); > +} > + > +/* > +** s64_z_pg_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_z_pg_op1 (svbool_t pg, svint64_t op2) > +{ > + return svdiv_z (pg, svdup_s64 (0), op2); > +} > + > +/* > +** s64_m_pg_op1: > +** mov z[0-9]+\.d, p[0-7]/z, #0 > +** ret > +*/ > +svint64_t s64_m_pg_op1 (svbool_t pg, svint64_t op2) > +{ > + return svdiv_m (pg, svdup_s64 (0), op2); > +} > + > +/* > +** s64_x_pg_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_x_pg_op2 (svbool_t pg, svint64_t op1) > +{ > + return svdiv_x (pg, op1, svdup_s64 (0)); > +} > + > +/* > +** s64_z_pg_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_z_pg_op2 (svbool_t pg, svint64_t op1) > +{ > + return svdiv_z (pg, op1, svdup_s64 (0)); > +} > + > +/* > +** s64_m_pg_op2: > +** mov (z[0-9]+)\.b, #0 > +** sdiv (z[0-9]+\.d), p[0-7]/m, \2, \1\.d > +** ret > +*/ > +svint64_t s64_m_pg_op2 (svbool_t pg, svint64_t op1) > +{ > + return svdiv_m (pg, op1, svdup_s64 (0)); > +} > + > +/* > +** s64_m_ptrue_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_m_ptrue_op1 (svint64_t op2) > +{ > + return svdiv_m (svptrue_b64 (), svdup_s64 (0), op2); > +} > + > +/* > +** s64_m_ptrue_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_m_ptrue_op2 (svint64_t op1) > +{ > + return svdiv_m (svptrue_b64 (), op1, svdup_s64 (0)); > +} > + > +/* > +** s64_m_ptrue_op1_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_m_ptrue_op1_op2 () > +{ > + return svdiv_m (svptrue_b64 (), svdup_s64 (0), svdup_s64 (0)); > +} > + > +/* > +** u64_x_pg_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svuint64_t u64_x_pg_op1 (svbool_t pg, svuint64_t op2) > +{ > + return svdiv_x (pg, svdup_u64 (0), op2); > +} > + > +/* > +** u64_z_pg_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svuint64_t u64_z_pg_op1 (svbool_t pg, svuint64_t op2) > +{ > + return svdiv_z (pg, svdup_u64 (0), op2); > +} > + > +/* > +** u64_m_pg_op1: > +** mov z[0-9]+\.d, p[0-7]/z, #0 > +** ret > +*/ > +svuint64_t u64_m_pg_op1 (svbool_t pg, svuint64_t op2) > +{ > + return svdiv_m (pg, svdup_u64 (0), op2); > +} > + > +/* > +** u64_x_pg_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svuint64_t u64_x_pg_op2 (svbool_t pg, svuint64_t op1) > +{ > + return svdiv_x (pg, op1, svdup_u64 (0)); > +} > + > +/* > +** u64_z_pg_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svuint64_t u64_z_pg_op2 (svbool_t pg, svuint64_t op1) > +{ > + return svdiv_z (pg, op1, svdup_u64 (0)); > +} > + > +/* > +** u64_m_pg_op2: > +** mov (z[0-9]+)\.b, #0 > +** udiv (z[0-9]+\.d), p[0-7]/m, \2, \1\.d > +** ret > +*/ > +svuint64_t u64_m_pg_op2 (svbool_t pg, svuint64_t op1) > +{ > + return svdiv_m (pg, op1, svdup_u64 (0)); > +} > + > +/* > +** u64_m_ptrue_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svuint64_t u64_m_ptrue_op1 (svuint64_t op2) > +{ > + return svdiv_m (svptrue_b64 (), svdup_u64 (0), 
op2); > +} > + > +/* > +** u64_m_ptrue_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svuint64_t u64_m_ptrue_op2 (svuint64_t op1) > +{ > + return svdiv_m (svptrue_b64 (), op1, svdup_u64 (0)); > +} > + > +/* > +** u64_m_ptrue_op1_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svuint64_t u64_m_ptrue_op1_op2 () > +{ > + return svdiv_m (svptrue_b64 (), svdup_u64 (0), svdup_u64 (0)); > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_zero.c > b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_zero.c > new file mode 100644 > index 00000000000..793291449c1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_zero.c > @@ -0,0 +1,95 @@ > +/* { dg-final { check-function-bodies "**" "" } } */ > +/* { dg-options "-O2" } */ > + > +#include "arm_sve.h" > + > +/* > +** s64_x_pg_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_x_pg_op1 (svbool_t pg, svint64_t op2) > +{ > + return svmul_x (pg, svdup_s64 (0), op2); > +} > + > +/* > +** s64_z_pg_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_z_pg_op1 (svbool_t pg, svint64_t op2) > +{ > + return svdiv_z (pg, svdup_s64 (0), op2); > +} > + > +/* > +** s64_m_pg_op1: > +** mov z[0-9]+\.d, p[0-7]/z, #0 > +** ret > +*/ > +svint64_t s64_m_pg_op1 (svbool_t pg, svint64_t op2) > +{ > + return svdiv_m (pg, svdup_s64 (0), op2); > +} > + > +/* > +** s64_x_pg_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_x_pg_op2 (svbool_t pg, svint64_t op1) > +{ > + return svdiv_x (pg, op1, svdup_s64 (0)); > +} > + > +/* > +** s64_z_pg_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_z_pg_op2 (svbool_t pg, svint64_t op1) > +{ > + return svdiv_z (pg, op1, svdup_s64 (0)); > +} > + > +/* > +** s64_m_pg_op2: > +** mov (z[0-9]+)\.b, #0 > +** mul (z[0-9]+\.d), p[0-7]+/m, \2, \1\.d > +** ret > +*/ > +svint64_t s64_m_pg_op2 (svbool_t pg, svint64_t op1) > +{ > + return svdiv_m (pg, op1, svdup_s64 (0)); > +} > + > +/* > +** s64_m_ptrue_op1: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_m_ptrue_op1 (svint64_t op2) > +{ > + return svdiv_m (svptrue_b64 (), svdup_s64 (0), op2); > +} > + > +/* > +** s64_m_ptrue_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_m_ptrue_op2 (svint64_t op1) > +{ > + return svdiv_m (svptrue_b64 (), op1, svdup_s64 (0)); > +} > + > +/* > +** s64_m_ptrue_op1_op2: > +** mov z[0-9]+\.b, #0 > +** ret > +*/ > +svint64_t s64_m_ptrue_op1_op2 () > +{ > + return svdiv_m (svptrue_b64 (), svdup_s64 (0), svdup_s64 (0)); > +}