Jennifer Schmitz <jschm...@nvidia.com> writes:
> This patch implements constant folding of binary operations for SVE intrinsics
> by calling the constant-folding mechanism of the middle-end for a given
> tree_code.
> In fold-const.cc, the code for folding vector constants was moved from
> const_binop to a new function, vector_const_binop. This function takes a
> function pointer as an argument that specifies how to fold the vector
> elements.
> The code for folding operations where the first operand is a vector
> constant and the second operand is an integer constant was also moved
> into vector_const_binop, so that binary SVE intrinsics whose second
> operand is an integer (_n) can also be folded.
> In the aarch64 backend, the new function aarch64_const_binop was
> created, which, in contrast to int_const_binop, does not treat operations
> as overflowing. This function is passed as a callback to vector_const_binop
> during gimple folding in the intrinsic implementations.
> Because aarch64_const_binop calls poly_int_binop, the latter was made public.
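For readers following the series: the expected wiring of that callback looks
roughly like the sketch below.  The folder function name and the f.call/f.lhs
accesses are illustrative assumptions rather than part of this patch (the
real gimple folding hooks arrive with the later patches in the series); only
vector_const_binop and aarch64_const_binop are taken from the patch itself.

  /* Hypothetical sketch: fold a binary SVE intrinsic whose data operands
     (arguments 1 and 2, after the governing predicate) are constants,
     using aarch64_const_binop as the per-element folding callback.  */
  gimple *
  fold_const_binary_sketch (gimple_folder &f, tree_code code)
  {
    tree arg1 = gimple_call_arg (f.call, 1);
    tree arg2 = gimple_call_arg (f.call, 2);
    if (tree res = vector_const_binop (code, arg1, arg2, aarch64_const_binop))
      return gimple_build_assign (f.lhs, res);
    return NULL;
  }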
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu with no regressions.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>       * config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
>       New function to fold binary SVE intrinsics without overflow.
>       * config/aarch64/aarch64-sve-builtins.h: Declare aarch64_const_binop.
>       * fold-const.h: Declare vector_const_binop.
>       * fold-const.cc (const_binop): Remove cases for vector constants.
>       (vector_const_binop): New function that folds vector constants
>       element-wise.
>       (int_const_binop): Remove call to wide_int_binop.
>       (poly_int_binop): Add call to wide_int_binop.
>
> From 2a773d8289b5ec5ab2f2e0d03cbaa35b48bc44b2 Mon Sep 17 00:00:00 2001
> From: Jennifer Schmitz <jschm...@nvidia.com>
> Date: Thu, 29 Aug 2024 04:35:49 -0700
> Subject: [PATCH 1/3] SVE intrinsics: Fold constant operands.
>
> This patch implements constant folding of binary operations for SVE intrinsics
> by calling the constant-folding mechanism of the middle-end for a given
> tree_code.
> In fold-const.cc, the code for folding vector constants was moved from
> const_binop to a new function, vector_const_binop. This function takes a
> function pointer as an argument that specifies how to fold the vector
> elements.
> The code for folding operations where the first operand is a vector
> constant and the second operand is an integer constant was also moved
> into vector_const_binop, so that binary SVE intrinsics whose second
> operand is an integer (_n) can also be folded.
> In the aarch64 backend, the new function aarch64_const_binop was
> created, which, in contrast to int_const_binop, does not treat operations
> as overflowing. This function is passed as a callback to vector_const_binop
> during gimple folding in the intrinsic implementations.
> Because aarch64_const_binop calls poly_int_binop, the latter was made public.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu with no regressions.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>       * config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
>       New function to fold binary SVE intrinsics without overflow.
>       * config/aarch64/aarch64-sve-builtins.h: Declare aarch64_const_binop.
>       * fold-const.h: Declare vector_const_binop.
>       * fold-const.cc (const_binop): Remove cases for vector constants.
>       (vector_const_binop): New function that folds vector constants
>       element-wise.
>       (int_const_binop): Remove call to wide_int_binop.
>       (poly_int_binop): Add call to wide_int_binop.
> ---
>  gcc/config/aarch64/aarch64-sve-builtins.cc |  20 +++
>  gcc/config/aarch64/aarch64-sve-builtins.h  |   1 +
>  gcc/fold-const.cc                          | 192 +++++++++++----------
>  gcc/fold-const.h                           |   5 +
>  4 files changed, 131 insertions(+), 87 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index 5ca9ec32b69..315d5ac4177 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3432,6 +3432,26 @@ is_ptrue (tree v, unsigned int step)
>         && vector_cst_all_same (v, step));
>  }
>  
> +/* Try to fold constant arguments arg1 and arg2 using the given tree_code.
> +   Operations are not treated as overflowing.  */
> +tree
> +aarch64_const_binop (enum tree_code code, tree arg1, tree arg2)
> +{
> +  if (poly_int_tree_p (arg1) && poly_int_tree_p (arg2))
> +    {
> +      poly_wide_int poly_res;
> +      tree type = TREE_TYPE (arg1);
> +      signop sign = TYPE_SIGN (type);
> +      wi::overflow_type overflow = wi::OVF_NONE;
> +
> +      if (!poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow))
> +        return NULL_TREE;
> +      return force_fit_type (type, poly_res, false,
> +                             TREE_OVERFLOW (arg1) | TREE_OVERFLOW (arg2));
> +    }
> +  return NULL_TREE;
> +}
> +

This is ok, but for staging purposes, I think it should be part of
the later patches that call it, with the function marked "static".

>  gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
>                             gimple_stmt_iterator *gsi_in, gcall *call_in)
>    : function_call_info (gimple_location (call_in), instance, fndecl),
> @@ -1677,82 +1763,14 @@ const_binop (enum tree_code code, tree arg1, tree arg2)
>        && (simplified = simplify_const_binop (code, arg2, arg1, 1)))
>      return simplified;
>  
> -  if (TREE_CODE (arg1) == VECTOR_CST
> -      && TREE_CODE (arg2) == VECTOR_CST
> -      && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)),
> -                TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg2))))
> -    {
> -      tree type = TREE_TYPE (arg1);
> -      bool step_ok_p;
> -      if (VECTOR_CST_STEPPED_P (arg1)
> -       && VECTOR_CST_STEPPED_P (arg2))
> -     /* We can operate directly on the encoding if:
> -
> -           a3 - a2 == a2 - a1 && b3 - b2 == b2 - b1
> -         implies
> -           (a3 op b3) - (a2 op b2) == (a2 op b2) - (a1 op b1)
> -
> -        Addition and subtraction are the supported operators
> -        for which this is true.  */
> -     step_ok_p = (code == PLUS_EXPR || code == MINUS_EXPR);
> -      else if (VECTOR_CST_STEPPED_P (arg1))
> -     /* We can operate directly on stepped encodings if:
> -
> -          a3 - a2 == a2 - a1
> -        implies:
> -          (a3 op c) - (a2 op c) == (a2 op c) - (a1 op c)
> -
> -        which is true if (x -> x op c) distributes over addition.  */
> -     step_ok_p = distributes_over_addition_p (code, 1);
> -      else
> -     /* Similarly in reverse.  */
> -     step_ok_p = distributes_over_addition_p (code, 2);
> -      tree_vector_builder elts;
> -      if (!elts.new_binary_operation (type, arg1, arg2, step_ok_p))
> -     return NULL_TREE;
> -      unsigned int count = elts.encoded_nelts ();
> -      for (unsigned int i = 0; i < count; ++i)
> -     {
> -       tree elem1 = VECTOR_CST_ELT (arg1, i);
> -       tree elem2 = VECTOR_CST_ELT (arg2, i);
> -
> -       tree elt = const_binop (code, elem1, elem2);
> +  if ((TREE_CODE (arg1) == VECTOR_CST
> +       && TREE_CODE (arg2) == VECTOR_CST
> +       && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)),
> +                 TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg2))))
> +      || (TREE_CODE (arg1) == VECTOR_CST
> +       && TREE_CODE (arg2) == INTEGER_CST))
> +    return vector_const_binop (code, arg1, arg2, const_binop);

Rather than do:

  if ((TREE_CODE (arg1) == VECTOR_CST
       && TREE_CODE (arg2) == VECTOR_CST
       && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)),
                    TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg2))))
      || (TREE_CODE (arg1) == VECTOR_CST
          && TREE_CODE (arg2) == INTEGER_CST))
    return vector_const_binop (code, arg1, arg2, const_binop);

  return NULL_TREE;

I think we should just tail-call vector_const_binop unconditionally:

  return vector_const_binop (code, arg1, arg2, const_binop);

That should be more efficient, and also protects against any risk
that the conditions would get out of sync.
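
For context, since the fold-const.cc hunk that adds it is not quoted above:
the unconditional tail call is safe because vector_const_binop performs
these operand checks itself and returns NULL_TREE when neither case applies.
A sketch of its expected shape, reassembled from the code removed from
const_binop; the function the patch actually adds may differ in detail:

  tree
  vector_const_binop (enum tree_code code, tree arg1, tree arg2,
                      tree (*elt_binop) (enum tree_code, tree, tree))
  {
    if (TREE_CODE (arg1) == VECTOR_CST
        && TREE_CODE (arg2) == VECTOR_CST
        && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)),
                     TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg2))))
      {
        tree type = TREE_TYPE (arg1);
        bool step_ok_p;
        if (VECTOR_CST_STEPPED_P (arg1) && VECTOR_CST_STEPPED_P (arg2))
          /* Stepped op stepped: only addition and subtraction preserve
             the stepped encoding.  */
          step_ok_p = (code == PLUS_EXPR || code == MINUS_EXPR);
        else if (VECTOR_CST_STEPPED_P (arg1))
          step_ok_p = distributes_over_addition_p (code, 1);
        else
          step_ok_p = distributes_over_addition_p (code, 2);
        tree_vector_builder elts;
        if (!elts.new_binary_operation (type, arg1, arg2, step_ok_p))
          return NULL_TREE;
        unsigned int count = elts.encoded_nelts ();
        for (unsigned int i = 0; i < count; ++i)
          {
            tree elt = elt_binop (code, VECTOR_CST_ELT (arg1, i),
                                  VECTOR_CST_ELT (arg2, i));
            /* The callback can fail for codes it does not handle.  */
            if (elt == NULL_TREE)
              return NULL_TREE;
            elts.quick_push (elt);
          }
        return elts.build ();
      }

    /* Shifts allow a scalar offset for a vector.  */
    if (TREE_CODE (arg1) == VECTOR_CST
        && TREE_CODE (arg2) == INTEGER_CST)
      {
        tree type = TREE_TYPE (arg1);
        bool step_ok_p = distributes_over_addition_p (code, 1);
        tree_vector_builder elts;
        if (!elts.new_unary_operation (type, arg1, step_ok_p))
          return NULL_TREE;
        unsigned int count = elts.encoded_nelts ();
        for (unsigned int i = 0; i < count; ++i)
          {
            tree elt = elt_binop (code, VECTOR_CST_ELT (arg1, i), arg2);
            if (elt == NULL_TREE)
              return NULL_TREE;
            elts.quick_push (elt);
          }
        return elts.build ();
      }

    return NULL_TREE;
  }

Either way, a failed fold simply comes back as NULL_TREE, so callers do not
need to duplicate the operand checks before calling it.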

OK for the target-independent bits with that change, if there is
no objection by this time on Monday.

Thanks,
Richard

>  
> -       /* It is possible that const_binop cannot handle the given
> -          code and return NULL_TREE */
> -       if (elt == NULL_TREE)
> -         return NULL_TREE;
> -       elts.quick_push (elt);
> -     }
> -
> -      return elts.build ();
> -    }
> -
> -  /* Shifts allow a scalar offset for a vector.  */
> -  if (TREE_CODE (arg1) == VECTOR_CST
> -      && TREE_CODE (arg2) == INTEGER_CST)
> -    {
> -      tree type = TREE_TYPE (arg1);
> -      bool step_ok_p = distributes_over_addition_p (code, 1);
> -      tree_vector_builder elts;
> -      if (!elts.new_unary_operation (type, arg1, step_ok_p))
> -     return NULL_TREE;
> -      unsigned int count = elts.encoded_nelts ();
> -      for (unsigned int i = 0; i < count; ++i)
> -     {
> -       tree elem1 = VECTOR_CST_ELT (arg1, i);
> -
> -       tree elt = const_binop (code, elem1, arg2);
> -
> -       /* It is possible that const_binop cannot handle the given
> -          code and return NULL_TREE.  */
> -       if (elt == NULL_TREE)
> -         return NULL_TREE;
> -       elts.quick_push (elt);
> -     }
> -
> -      return elts.build ();
> -    }
>    return NULL_TREE;
>  }
>  
> diff --git a/gcc/fold-const.h b/gcc/fold-const.h
> index b82ef137e2f..3e3998b57b0 100644
> --- a/gcc/fold-const.h
> +++ b/gcc/fold-const.h
> @@ -126,6 +126,9 @@ extern tree fold_vec_perm (tree, tree, tree, const vec_perm_indices &);
>  extern bool wide_int_binop (wide_int &res, enum tree_code,
>                           const wide_int &arg1, const wide_int &arg2,
>                           signop, wi::overflow_type *);
> +extern bool poly_int_binop (poly_wide_int &res, enum tree_code,
> +                         const_tree, const_tree, signop,
> +                         wi::overflow_type *);
>  extern tree int_const_binop (enum tree_code, const_tree, const_tree, int = 1);
>  #define build_fold_addr_expr(T)\
>          build_fold_addr_expr_loc (UNKNOWN_LOCATION, (T))
> @@ -218,6 +221,8 @@ extern bool simple_condition_p (tree);
>  extern tree exact_inverse (tree, tree);
>  extern bool expr_not_equal_to (tree t, const wide_int &);
>  extern tree const_unop (enum tree_code, tree, tree);
> +extern tree vector_const_binop (enum tree_code, tree, tree,
> +                             tree (*) (enum tree_code, tree, tree));
>  extern tree const_binop (enum tree_code, tree, tree, tree);
>  extern bool negate_mathfn_p (combined_fn);
>  extern const char *getbyterep (tree, unsigned HOST_WIDE_INT *);
