Jennifer Schmitz <jschm...@nvidia.com> writes:
> This patch folds svdiv where one of the operands is all-zeros to a zero
> vector, if the predicate is ptrue or the predication is _x or _z.
> This case was not covered by the recent patch that implemented constant
> folding, because that covered only cases where both operands are
> constant vectors. Here, the operation is folded whenever one of the operands
> is a constant zero vector.
> Folding division by 0 to return 0 is in accordance with
> the semantics of sdiv and udiv.
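
As a concrete illustration (the function and parameter names below are just
for the example; the behaviour mirrors the new s64_z_pg_op1 test further
down), a call like this now folds to a single zero-vector move at -O2:

    #include <arm_sve.h>

    svint64_t zero_dividend (svbool_t pg, svint64_t x)
    {
      /* The dividend is a constant zero vector, so the division folds.  */
      return svdiv_z (pg, svdup_s64 (0), x);
    }

    /* Expected codegen, as checked by the new test:
         mov     z0.b, #0
         ret  */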
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>       * config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
>       Add folding of all-zero operands to zero vector.
>
> gcc/testsuite/
>       * gcc.target/aarch64/sve/fold_div_zero.c: New test.
>       * gcc.target/aarch64/sve/const_fold_div_1.c: Adjust expected
>       outcome.
>
> From 1d50cc57cd3bbe19a48b7bbb543ea331cbd9a6f6 Mon Sep 17 00:00:00 2001
> From: Jennifer Schmitz <jschm...@nvidia.com>
> Date: Mon, 2 Sep 2024 06:46:57 -0700
> Subject: [PATCH] SVE intrinsics: Fold svdiv with all-zero operands to zero
>  vector
>
> This patch folds svdiv where one of the operands is all-zeros to a zero
> vector, if the predicate is ptrue or the predication is _x or _z.
> This case was not covered by the recent patch that implemented constant
> folding, because that covered only cases where both operands are
> constant vectors. Here, the operation is folded whenever one of the operands
> is a constant zero vector.
> Folding division by 0 to return 0 is in accordance with
> the semantics of sdiv and udiv.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>       * config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
>       Add folding of all-zero operands to zero vector.
>
> gcc/testsuite/
>       * gcc.target/aarch64/sve/fold_div_zero.c: New test.
>       * gcc.target/aarch64/sve/const_fold_div_1.c: Adjust expected
>       outcome.
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc      |  38 +-
>  .../gcc.target/aarch64/sve/const_fold_div_1.c |  12 +-
>  .../gcc.target/aarch64/sve/fold_div_zero.c    | 369 ++++++++++++++++++
>  3 files changed, 402 insertions(+), 17 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 6c94d144dc9..3ec9ebbf6ef 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -758,30 +758,50 @@ public:
>      if (auto *res = f.fold_const_binary (TRUNC_DIV_EXPR))
>        return res;
>  
> -    /* If the divisor is a uniform power of 2, fold to a shift
> -       instruction.  */
> +    tree pg = gimple_call_arg (f.call, 0);
> +    tree op1 = gimple_call_arg (f.call, 1);
>      tree op2 = gimple_call_arg (f.call, 2);
> -    tree divisor_cst = uniform_integer_cst_p (op2);
> +    bool pred_fold = f.pred != PRED_m
> +                  || is_ptrue (pg, f.type_suffix (0).element_bytes);
>  
> -    if (!divisor_cst || !integer_pow2p (divisor_cst))
> +    /* If the dividend is all zeros, fold to zero vector.  */
> +    tree op1_cst = uniform_integer_cst_p (op1);
> +    if (op1_cst && pred_fold && integer_zerop (op1_cst))
> +      return gimple_build_assign (f.lhs, op1);

This fold is ok for all predication types, since _m merges with
the first input.  There's also no need to apply uniform_integer_cst_p
manually, since integer_zerop handles vectors too.  So I think this can be:

    /* If the dividend is all zeros, fold to zero vector.  */
    if (integer_zerop (op1))
      return gimple_build_assign (f.lhs, op1);

(The new _m cases would need tests though!)
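
For instance (a sketch only, assuming the simplified fold turns the _m case
into a plain unpredicated zero vector, just like the _x/_z cases), the
existing s64_m_pg_op1 test in fold_div_zero.c would presumably change to
expect:

    /*
    ** s64_m_pg_op1:
    **   mov     z[0-9]+\.b, #0
    **   ret
    */
    svint64_t s64_m_pg_op1 (svbool_t pg, svint64_t op2)
    {
      return svdiv_m (pg, svdup_s64 (0), op2);
    }

with the analogous change for u64_m_pg_op1.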

> +
> +    /* If the divisor is all zeros, fold to zero vector.  */
> +    tree op2_cst = uniform_integer_cst_p (op2);
> +    if (!op2_cst)
>        return NULL;
>  
> +    if (pred_fold && integer_zerop (op2_cst))
> +      {
> +     gimple_seq stmts = NULL;
> +     tree op2_vec = f.force_vector (stmts, TREE_TYPE (op1), op2);
> +     gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
> +     return gimple_build_assign (f.lhs, op2_vec);
> +      }

This would be simpler as:

    if (integer_zerop (op2_cst)
        && (f.pred != PRED_m
            || is_ptrue (pg, f.type_suffix (0).element_bytes)))
      return gimple_build_assign (f.lhs, build_zero_cst (TREE_TYPE (f.lhs)));

(I've dropped the pred_fold variable, since it is only valid for
things that fold to zero.  For everything else we'd need == PRED_x
instead.)
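
To spell out the asymmetry (the function names below are illustrative only):
with _m, inactive lanes keep op1, so a zero dividend gives an all-zero result
for any predicate, whereas a zero divisor only zeroes the active lanes.

    /* Dividend zero, _m: inactive lanes keep op1 == 0 and active lanes
       compute 0 / x == 0, so the result is all zeros for any pg.  */
    svint64_t fold_ok (svbool_t pg, svint64_t x)
    {
      return svdiv_m (pg, svdup_s64 (0), x);
    }

    /* Divisor zero, _m: inactive lanes keep op1 (arbitrary) and active
       lanes compute op1 / 0 == 0, so the result is a constant zero vector
       only if pg is all-true; the s64_m_pg_op2 test accordingly keeps
       the sdiv.  */
    svint64_t no_fold (svbool_t pg, svint64_t op1)
    {
      return svdiv_m (pg, op1, svdup_s64 (0));
    }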

> +
> +    /* If the divisor is a uniform power of 2, fold to a shift
> +       instruction.  */
> +    if (!integer_pow2p (op2_cst))
> +      return NULL;
>      tree new_divisor;
>      gcall *call;

Very minor nit, but: given the line spacing in the function, I think
it'd look better to have a blank line after the return.

Thanks,
Richard

> -    if (f.type_suffix (0).unsigned_p && tree_to_uhwi (divisor_cst) != 1)
> +    if (f.type_suffix (0).unsigned_p && tree_to_uhwi (op2_cst) != 1)
>        {
>       function_instance instance ("svlsr", functions::svlsr,
>                                   shapes::binary_uint_opt_n, MODE_n,
>                                   f.type_suffix_ids, GROUP_none, f.pred);
>       call = f.redirect_call (instance);
> -     tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
> +     tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : op2_cst;
>       new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
>        }
>      else
>        {
> -     if (tree_int_cst_sign_bit (divisor_cst)
> -         || tree_to_shwi (divisor_cst) == 1)
> +     if (tree_int_cst_sign_bit (op2_cst)
> +         || tree_to_shwi (op2_cst) == 1)
>         return NULL;
>  
>       function_instance instance ("svasrd", functions::svasrd,
> @@ -789,7 +809,7 @@ public:
>                                   f.type_suffix_ids, GROUP_none, f.pred);
>       call = f.redirect_call (instance);
>       new_divisor = wide_int_to_tree (scalar_types[VECTOR_TYPE_svuint64_t],
> -                                     tree_log2 (divisor_cst));
> +                                     tree_log2 (op2_cst));
>        }
>  
>      gimple_call_set_arg (call, 2, new_divisor);
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
> index c15b3fc3aa0..92e0005c0fe 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
> @@ -45,7 +45,7 @@ svint64_t s64_z_pg (svbool_t pg)
>  
>  /*
>  ** s64_z_pg_0:
> -**   mov     z[0-9]+\.d, p[0-7]/z, #0
> +**   mov     z[0-9]+\.b, #0
>  **   ret
>  */
>  svint64_t s64_z_pg_0 (svbool_t pg)
> @@ -55,9 +55,7 @@ svint64_t s64_z_pg_0 (svbool_t pg)
>  
>  /*
>  ** s64_z_pg_by0:
> -**   mov     (z[0-9]+\.d), #5
> -**   mov     (z[0-9]+)\.b, #0
> -**   sdivr   \2\.d, p[0-7]/m, \2\.d, \1
> +**   mov     z[0-9]+\.b, #0
>  **   ret
>  */
>  svint64_t s64_z_pg_by0 (svbool_t pg)
> @@ -149,7 +147,7 @@ svint64_t s64_z_pg_n (svbool_t pg)
>  
>  /*
>  ** s64_z_pg_n_s64_0:
> -**   mov     z[0-9]+\.d, p[0-7]/z, #0
> +**   mov     z[0-9]+\.b, #0
>  **   ret
>  */
>  svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
> @@ -159,9 +157,7 @@ svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
>  
>  /*
>  ** s64_z_pg_n_s64_by0:
> -**   mov     (z[0-9]+\.d), #5
> -**   mov     (z[0-9]+)\.b, #0
> -**   sdivr   \2\.d, p[0-7]/m, \2\.d, \1
> +**   mov     z[0-9]+\.b, #0
>  **   ret
>  */
>  svint64_t s64_z_pg_n_s64_by0 (svbool_t pg)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c
> new file mode 100644
> index 00000000000..be4a7353da0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_div_zero.c
> @@ -0,0 +1,369 @@
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* { dg-options "-O2" } */
> +
> +#include "arm_sve.h"
> +
> +/*
> +** s64_x_pg_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_x_pg_op1 (svbool_t pg, svint64_t op2)
> +{
> +  return svdiv_x (pg, svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_z_pg_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_z_pg_op1 (svbool_t pg, svint64_t op2)
> +{
> +  return svdiv_z (pg, svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_m_pg_op1:
> +**   mov     z[0-9]+\.d, p[0-7]/z, #0
> +**   ret
> +*/
> +svint64_t s64_m_pg_op1 (svbool_t pg, svint64_t op2)
> +{
> +  return svdiv_m (pg, svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_x_ptrue_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_x_ptrue_op1 (svint64_t op2)
> +{
> +  return svdiv_x (svptrue_b64 (), svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_z_ptrue_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_z_ptrue_op1 (svint64_t op2)
> +{
> +  return svdiv_z (svptrue_b64 (), svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_m_ptrue_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_m_ptrue_op1 (svint64_t op2)
> +{
> +  return svdiv_m (svptrue_b64 (), svdup_s64 (0), op2);
> +}
> +
> +/*
> +** s64_x_pg_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_x_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_x (pg, op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_z_pg_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_z_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_z (pg, op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_m_pg_op2:
> +**   mov     (z[0-9]+)\.b, #0
> +**   sdiv    (z[0-9]+\.d), p[0-7]/m, \2, \1\.d
> +**   ret
> +*/
> +svint64_t s64_m_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_m (pg, op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_x_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_x_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_x (svptrue_b64 (), op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_z_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_z_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_z (svptrue_b64 (), op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_m_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_m_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_m (svptrue_b64 (), op1, svdup_s64 (0));
> +}
> +
> +/*
> +** s64_n_x_pg_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_n_x_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_n_s64_x (pg, op1, 0);
> +}
> +
> +/*
> +** s64_n_z_pg_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_n_z_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_n_s64_z (pg, op1, 0);
> +}
> +
> +/*
> +** s64_n_m_pg_op2:
> +**   mov     (z[0-9]+)\.b, #0
> +**   sdiv    (z[0-9]+\.d), p[0-7]/m, \2, \1\.d
> +**   ret
> +*/
> +svint64_t s64_n_m_pg_op2 (svbool_t pg, svint64_t op1)
> +{
> +  return svdiv_n_s64_m (pg, op1, 0);
> +}
> +
> +/*
> +** s64_n_x_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_n_x_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_n_s64_x (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** s64_n_z_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_n_z_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_n_s64_z (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** s64_n_m_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svint64_t s64_n_m_ptrue_op2 (svint64_t op1)
> +{
> +  return svdiv_n_s64_m (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** u64_x_pg_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_x_pg_op1 (svbool_t pg, svuint64_t op2)
> +{
> +  return svdiv_x (pg, svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_z_pg_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_z_pg_op1 (svbool_t pg, svuint64_t op2)
> +{
> +  return svdiv_z (pg, svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_m_pg_op1:
> +**   mov     z[0-9]+\.d, p[0-7]/z, #0
> +**   ret
> +*/
> +svuint64_t u64_m_pg_op1 (svbool_t pg, svuint64_t op2)
> +{
> +  return svdiv_m (pg, svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_x_ptrue_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_x_ptrue_op1 (svuint64_t op2)
> +{
> +  return svdiv_x (svptrue_b64 (), svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_z_ptrue_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_z_ptrue_op1 (svuint64_t op2)
> +{
> +  return svdiv_z (svptrue_b64 (), svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_m_ptrue_op1:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_m_ptrue_op1 (svuint64_t op2)
> +{
> +  return svdiv_m (svptrue_b64 (), svdup_u64 (0), op2);
> +}
> +
> +/*
> +** u64_x_pg_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_x_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_x (pg, op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_z_pg_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_z_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_z (pg, op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_m_pg_op2:
> +**   mov     (z[0-9]+)\.b, #0
> +**   udiv    (z[0-9]+\.d), p[0-7]/m, \2, \1\.d
> +**   ret
> +*/
> +svuint64_t u64_m_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_m (pg, op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_x_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_x_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_x (svptrue_b64 (), op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_z_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_z_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_z (svptrue_b64 (), op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_m_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_m_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_m (svptrue_b64 (), op1, svdup_u64 (0));
> +}
> +
> +/*
> +** u64_n_x_pg_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_n_x_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_n_u64_x (pg, op1, 0);
> +}
> +
> +/*
> +** u64_n_z_pg_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_n_z_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_n_u64_z (pg, op1, 0);
> +}
> +
> +/*
> +** u64_n_m_pg_op2:
> +**   mov     (z[0-9]+)\.b, #0
> +**   udiv    (z[0-9]+\.d), p[0-7]/m, \2, \1\.d
> +**   ret
> +*/
> +svuint64_t u64_n_m_pg_op2 (svbool_t pg, svuint64_t op1)
> +{
> +  return svdiv_n_u64_m (pg, op1, 0);
> +}
> +
> +/*
> +** u64_n_x_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_n_x_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_n_u64_x (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** u64_n_z_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_n_z_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_n_u64_z (svptrue_b64 (), op1, 0);
> +}
> +
> +/*
> +** u64_n_m_ptrue_op2:
> +**   mov     z[0-9]+\.b, #0
> +**   ret
> +*/
> +svuint64_t u64_n_m_ptrue_op2 (svuint64_t op1)
> +{
> +  return svdiv_n_u64_m (svptrue_b64 (), op1, 0);
> +}
> +
