https://gcc.gnu.org/g:ee8b7231b03a36dfc09d94f2b663636ca2a36daf
commit r15-3396-gee8b7231b03a36dfc09d94f2b663636ca2a36daf
Author: Jennifer Schmitz <jschm...@nvidia.com>
Date:   Fri Aug 30 07:03:49 2024 -0700

    SVE intrinsics: Fold constant operands for svdiv.

    This patch implements constant folding for svdiv: The new function
    aarch64_const_binop was created, which - in contrast to int_const_binop -
    does not treat operations as overflowing.  This function is passed as
    callback to vector_const_binop from the new gimple_folder method
    fold_const_binary, if the predicate is ptrue or predication is _x.
    From svdiv_impl::fold, fold_const_binary is called with TRUNC_DIV_EXPR
    as tree_code.
    In aarch64_const_binop, a case was added for TRUNC_DIV_EXPR to return 0
    for division by 0, as defined in the semantics for svdiv.
    Tests were added to check the produced assembly for different
    predicates, signed and unsigned integers, and the svdiv_n_* case.

    The patch was bootstrapped and regtested on aarch64-linux-gnu, no
    regression.
    OK for mainline?

    Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>

    gcc/
	* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
	Try constant folding.
	* config/aarch64/aarch64-sve-builtins.h: Declare
	gimple_folder::fold_const_binary.
	* config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
	New function to fold binary SVE intrinsics without overflow.
	(gimple_folder::fold_const_binary): New helper function for
	constant folding of SVE intrinsics.

    gcc/testsuite/
	* gcc.target/aarch64/sve/const_fold_div_1.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc    |  11 +-
 gcc/config/aarch64/aarch64-sve-builtins.cc         |  43 +++
 gcc/config/aarch64/aarch64-sve-builtins.h          |   1 +
 .../gcc.target/aarch64/sve/const_fold_div_1.c      | 358 +++++++++++++++++++++
 4 files changed, 410 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index d55bee0b72fa..6c94d144dc9c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -755,8 +755,13 @@ public:
   gimple *
   fold (gimple_folder &f) const override
   {
-    tree divisor = gimple_call_arg (f.call, 2);
-    tree divisor_cst = uniform_integer_cst_p (divisor);
+    if (auto *res = f.fold_const_binary (TRUNC_DIV_EXPR))
+      return res;
+
+    /* If the divisor is a uniform power of 2, fold to a shift
+       instruction.  */
+    tree op2 = gimple_call_arg (f.call, 2);
+    tree divisor_cst = uniform_integer_cst_p (op2);
 
     if (!divisor_cst || !integer_pow2p (divisor_cst))
       return NULL;
@@ -770,7 +775,7 @@ public:
				    shapes::binary_uint_opt_n, MODE_n,
				    f.type_suffix_ids, GROUP_none, f.pred);
 	call = f.redirect_call (instance);
-	tree d = INTEGRAL_TYPE_P (TREE_TYPE (divisor)) ? divisor : divisor_cst;
+	tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
 	new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
       }
     else
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 5ca9ec32b691..8f9aa3cf1207 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -1132,6 +1132,30 @@ report_not_enum (location_t location, tree fndecl, unsigned int argno,
	    " a valid %qT value", actual, argno + 1, fndecl, enumtype);
 }
 
+/* Try to fold constant arguments ARG1 and ARG2 using the given tree_code.
+   Operations are not treated as overflowing.  */
+static tree
+aarch64_const_binop (enum tree_code code, tree arg1, tree arg2)
+{
+  if (poly_int_tree_p (arg1) && poly_int_tree_p (arg2))
+    {
+      poly_wide_int poly_res;
+      tree type = TREE_TYPE (arg1);
+      signop sign = TYPE_SIGN (type);
+      wi::overflow_type overflow = wi::OVF_NONE;
+
+      /* Return 0 for division by 0, like SDIV and UDIV do.  */
+      if (code == TRUNC_DIV_EXPR && integer_zerop (arg2))
+	return arg2;
+
+      if (!poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow))
+	return NULL_TREE;
+      return force_fit_type (type, poly_res, false,
+			     TREE_OVERFLOW (arg1) | TREE_OVERFLOW (arg2));
+    }
+  return NULL_TREE;
+}
+
 /* Return a hash code for a function_instance.  */
 hashval_t
 function_instance::hash () const
@@ -3593,6 +3617,25 @@ gimple_folder::fold_to_vl_pred (unsigned int vl)
   return gimple_build_assign (lhs, builder.build ());
 }
 
+/* Try to fold the call to a constant, given that, for integers, the call
+   is roughly equivalent to binary operation CODE.  aarch64_const_binop
+   handles any differences between CODE and the intrinsic.  */
+gimple *
+gimple_folder::fold_const_binary (enum tree_code code)
+{
+  gcc_assert (gimple_call_num_args (call) == 3);
+  tree pg = gimple_call_arg (call, 0);
+  tree op1 = gimple_call_arg (call, 1);
+  tree op2 = gimple_call_arg (call, 2);
+
+  if (type_suffix (0).integer_p
+      && (pred == PRED_x || is_ptrue (pg, type_suffix (0).element_bytes)))
+    if (tree res = vector_const_binop (code, op1, op2, aarch64_const_binop))
+      return gimple_build_assign (lhs, res);
+
+  return NULL;
+}
+
 /* Try to fold the call.  Return the new statement on success and null
    on failure.  */
 gimple *
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
index 9ab6f202c306..e3880503da02 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -636,6 +636,7 @@ public:
   gimple *fold_to_pfalse ();
   gimple *fold_to_ptrue ();
   gimple *fold_to_vl_pred (unsigned int);
+  gimple *fold_const_binary (enum tree_code);
 
   gimple *fold ();
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
new file mode 100644
index 000000000000..c15b3fc3aa0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
@@ -0,0 +1,358 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O2" } */
+
+#include "arm_sve.h"
+
+/*
+** s64_x_pg:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svint64_t s64_x_pg (svbool_t pg)
+{
+  return svdiv_x (pg, svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_x_pg_0:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_x_pg_0 (svbool_t pg)
+{
+  return svdiv_x (pg, svdup_s64 (0), svdup_s64 (3));
+}
+
+/*
+** s64_x_pg_by0:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_x_pg_by0 (svbool_t pg)
+{
+  return svdiv_x (pg, svdup_s64 (5), svdup_s64 (0));
+}
+
+/*
+** s64_z_pg:
+**	mov	z[0-9]+\.d, p[0-7]/z, #1
+**	ret
+*/
+svint64_t s64_z_pg (svbool_t pg)
+{
+  return svdiv_z (pg, svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_z_pg_0:
+**	mov	z[0-9]+\.d, p[0-7]/z, #0
+**	ret
+*/
+svint64_t s64_z_pg_0 (svbool_t pg)
+{
+  return svdiv_z (pg, svdup_s64 (0), svdup_s64 (3));
+}
+
+/*
+** s64_z_pg_by0:
+**	mov	(z[0-9]+\.d), #5
+**	mov	(z[0-9]+)\.b, #0
+**	sdivr	\2\.d, p[0-7]/m, \2\.d, \1
+**	ret
+*/
+svint64_t s64_z_pg_by0 (svbool_t pg)
+{
+  return svdiv_z (pg, svdup_s64 (5), svdup_s64 (0));
+}
+
+/*
+** s64_m_pg:
+**	mov	(z[0-9]+\.d), #3
+**	mov	(z[0-9]+\.d), #5
+**	sdiv	\2, p[0-7]/m, \2, \1
+**	ret
+*/
+svint64_t s64_m_pg (svbool_t pg)
+{
+  return svdiv_m (pg, svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_x_ptrue:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svint64_t s64_x_ptrue ()
+{
+  return svdiv_x (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_z_ptrue:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svint64_t s64_z_ptrue ()
+{
+  return svdiv_z (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_m_ptrue:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svint64_t s64_m_ptrue ()
+{
+  return svdiv_m (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_x_pg_n:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svint64_t s64_x_pg_n (svbool_t pg)
+{
+  return svdiv_n_s64_x (pg, svdup_s64 (5), 3);
+}
+
+/*
+** s64_x_pg_n_s64_0:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_x_pg_n_s64_0 (svbool_t pg)
+{
+  return svdiv_n_s64_x (pg, svdup_s64 (0), 3);
+}
+
+/*
+** s64_x_pg_n_s64_by0:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint64_t s64_x_pg_n_s64_by0 (svbool_t pg)
+{
+  return svdiv_n_s64_x (pg, svdup_s64 (5), 0);
+}
+
+/*
+** s64_z_pg_n:
+**	mov	z[0-9]+\.d, p[0-7]/z, #1
+**	ret
+*/
+svint64_t s64_z_pg_n (svbool_t pg)
+{
+  return svdiv_n_s64_z (pg, svdup_s64 (5), 3);
+}
+
+/*
+** s64_z_pg_n_s64_0:
+**	mov	z[0-9]+\.d, p[0-7]/z, #0
+**	ret
+*/
+svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
+{
+  return svdiv_n_s64_z (pg, svdup_s64 (0), 3);
+}
+
+/*
+** s64_z_pg_n_s64_by0:
+**	mov	(z[0-9]+\.d), #5
+**	mov	(z[0-9]+)\.b, #0
+**	sdivr	\2\.d, p[0-7]/m, \2\.d, \1
+**	ret
+*/
+svint64_t s64_z_pg_n_s64_by0 (svbool_t pg)
+{
+  return svdiv_n_s64_z (pg, svdup_s64 (5), 0);
+}
+
+/*
+** s64_m_pg_n:
+**	mov	(z[0-9]+\.d), #3
+**	mov	(z[0-9]+\.d), #5
+**	sdiv	\2, p[0-7]/m, \2, \1
+**	ret
+*/
+svint64_t s64_m_pg_n (svbool_t pg)
+{
+  return svdiv_n_s64_m (pg, svdup_s64 (5), 3);
+}
+
+/*
+** s64_x_ptrue_n:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svint64_t s64_x_ptrue_n ()
+{
+  return svdiv_n_s64_x (svptrue_b64 (), svdup_s64 (5), 3);
+}
+
+/*
+** s64_z_ptrue_n:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svint64_t s64_z_ptrue_n ()
+{
+  return svdiv_n_s64_z (svptrue_b64 (), svdup_s64 (5), 3);
+}
+
+/*
+** s64_m_ptrue_n:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svint64_t s64_m_ptrue_n ()
+{
+  return svdiv_n_s64_m (svptrue_b64 (), svdup_s64 (5), 3);
+}
+
+/*
+** s32_m_ptrue_dupq:
+**	mov	z[0-9]+\.b, #0
+**	ret
+*/
+svint32_t s32_m_ptrue_dupq ()
+{
+  return svdiv_s32_m (svptrue_b32 (), svdupq_s32 (3, 0, -5, 11),
+		      svdupq_s32 (4, 1, -6, 0));
+}
+
+/*
+** s32_z_ptrue_dupq:
+**	mov	z[0-9]+\.s, #-2
+**	ret
+*/
+svint32_t s32_z_ptrue_dupq ()
+{
+  return svdiv_s32_z (svptrue_b32 (), svdupq_s32 (6, -30, 100, -4),
+		      svdupq_s32 (-3, 15, -50, 2));
+}
+
+/*
+** u64_x_pg:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svuint64_t u64_x_pg (svbool_t pg)
+{
+  return svdiv_x (pg, svdup_u64 (5), svdup_u64 (3));
+}
+
+/*
+** u64_z_pg:
+**	mov	z[0-9]+\.d, p[0-7]/z, #1
+**	ret
+*/
+svuint64_t u64_z_pg (svbool_t pg)
+{
+  return svdiv_z (pg, svdup_u64 (5), svdup_u64 (3));
+}
+
+/*
+** u64_m_pg:
+**	mov	(z[0-9]+\.d), #3
+**	mov	(z[0-9]+\.d), #5
+**	udiv	\2, p[0-7]/m, \2, \1
+**	ret
+*/
+svuint64_t u64_m_pg (svbool_t pg)
+{
+  return svdiv_m (pg, svdup_u64 (5), svdup_u64 (3));
+}
+
+/*
+** u64_x_ptrue:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svuint64_t u64_x_ptrue ()
+{
+  return svdiv_x (svptrue_b64 (), svdup_u64 (5), svdup_u64 (3));
+}
+
+/*
+** u64_z_ptrue:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svuint64_t u64_z_ptrue ()
+{
+  return svdiv_z (svptrue_b64 (), svdup_u64 (5), svdup_u64 (3));
+}
+
+/*
+** u64_m_ptrue:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svuint64_t u64_m_ptrue ()
+{
+  return svdiv_m (svptrue_b64 (), svdup_u64 (5), svdup_u64 (3));
+}
+
+/*
+** u64_x_pg_n:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svuint64_t u64_x_pg_n (svbool_t pg)
+{
+  return svdiv_n_u64_x (pg, svdup_u64 (5), 3);
+}
+
+/*
+** u64_z_pg_n:
+**	mov	z[0-9]+\.d, p[0-7]/z, #1
+**	ret
+*/
+svuint64_t u64_z_pg_n (svbool_t pg)
+{
+  return svdiv_n_u64_z (pg, svdup_u64 (5), 3);
+}
+
+/*
+** u64_m_pg_n:
+**	mov	(z[0-9]+\.d), #3
+**	mov	(z[0-9]+\.d), #5
+**	udiv	\2, p[0-7]/m, \2, \1
+**	ret
+*/
+svuint64_t u64_m_pg_n (svbool_t pg)
+{
+  return svdiv_n_u64_m (pg, svdup_u64 (5), 3);
+}
+
+/*
+** u64_x_ptrue_n:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svuint64_t u64_x_ptrue_n ()
+{
+  return svdiv_n_u64_x (svptrue_b64 (), svdup_u64 (5), 3);
+}
+
+/*
+** u64_z_ptrue_n:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svuint64_t u64_z_ptrue_n ()
+{
+  return svdiv_n_u64_z (svptrue_b64 (), svdup_u64 (5), 3);
+}
+
+/*
+** u64_m_ptrue_n:
+**	mov	z[0-9]+\.d, #1
+**	ret
+*/
+svuint64_t u64_m_ptrue_n ()
+{
+  return svdiv_n_u64_m (svptrue_b64 (), svdup_u64 (5), 3);
+}
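
[Editorial note] As a minimal usage sketch (not part of the patch; the function name fold_example and the constants are made up for illustration), the following shows the user-visible effect of the new fold. Because the calls use _x predication and constant operands, both divisions can be evaluated at compile time under -O2 for an SVE target, in the same way as the tests above: 15 / 4 truncates to 3, and the division by zero folds to 0, matching the SDIV/UDIV semantics noted in the commit message.

#include "arm_sve.h"

/* Illustrative only: both svdiv_x calls below have constant vector
   operands and _x predication, so the new fold evaluates them during
   gimple folding (15 / 4 -> 3, 7 / 0 -> 0); no SDIV instruction is
   needed for either division.  */
svint32_t
fold_example (svbool_t pg)
{
  svint32_t q = svdiv_x (pg, svdup_s32 (15), svdup_s32 (4));
  svint32_t z = svdiv_x (pg, svdup_s32 (7), svdup_s32 (0));
  return svadd_x (pg, q, z);
}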