Jennifer Schmitz <jschm...@nvidia.com> writes:
> This patch implements constant folding of binary operations for SVE intrinsics
> by calling the constant-folding mechanism of the middle-end for a given
> tree_code.
> In fold-const.cc, the code for folding vector constants was moved from
> const_binop to a new function vector_const_binop. This function takes a
> function pointer as argument specifying how to fold the vector elements.
> The code for folding operations where the first operand is a vector
> constant and the second argument is an integer constant was also moved
> into vector_const_binop to fold binary SVE intrinsics where the second
> operand is an integer (_n).
> In the aarch64 backend, the new function aarch64_const_binop was
> created, which - in contrast to int_const_binop - does not treat operations as
> overflowing. This function is passed as callback to vector_const_binop
> during gimple folding in intrinsic implementations.
> Because aarch64_const_binop calls poly_int_binop, the latter was made public.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
> 	* config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
> 	New function to fold binary SVE intrinsics without overflow.
> 	* config/aarch64/aarch64-sve-builtins.h: Declare aarch64_const_binop.
> 	* fold-const.h: Declare vector_const_binop.
> 	* fold-const.cc (const_binop): Remove cases for vector constants.
> 	(vector_const_binop): New function that folds vector constants
> 	element-wise.
> 	(int_const_binop): Remove call to wide_int_binop.
> 	(poly_int_binop): Add call to wide_int_binop.
>
> From 2a773d8289b5ec5ab2f2e0d03cbaa35b48bc44b2 Mon Sep 17 00:00:00 2001
> From: Jennifer Schmitz <jschm...@nvidia.com>
> Date: Thu, 29 Aug 2024 04:35:49 -0700
> Subject: [PATCH 1/3] SVE intrinsics: Fold constant operands.
>
> This patch implements constant folding of binary operations for SVE intrinsics
> by calling the constant-folding mechanism of the middle-end for a given
> tree_code.
> In fold-const.cc, the code for folding vector constants was moved from
> const_binop to a new function vector_const_binop. This function takes a
> function pointer as argument specifying how to fold the vector elements.
> The code for folding operations where the first operand is a vector
> constant and the second argument is an integer constant was also moved
> into vector_const_binop to fold binary SVE intrinsics where the second
> operand is an integer (_n).
> In the aarch64 backend, the new function aarch64_const_binop was
> created, which - in contrast to int_const_binop - does not treat operations as
> overflowing. This function is passed as callback to vector_const_binop
> during gimple folding in intrinsic implementations.
> Because aarch64_const_binop calls poly_int_binop, the latter was made public.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
> 	* config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
> 	New function to fold binary SVE intrinsics without overflow.
> 	* config/aarch64/aarch64-sve-builtins.h: Declare aarch64_const_binop.
> 	* fold-const.h: Declare vector_const_binop.
> 	* fold-const.cc (const_binop): Remove cases for vector constants.
> 	(vector_const_binop): New function that folds vector constants
> 	element-wise.
> 	(int_const_binop): Remove call to wide_int_binop.
> 	(poly_int_binop): Add call to wide_int_binop.
> ---
>  gcc/config/aarch64/aarch64-sve-builtins.cc |  20 +++
>  gcc/config/aarch64/aarch64-sve-builtins.h  |   1 +
>  gcc/fold-const.cc                          | 192 +++++++++++----------
>  gcc/fold-const.h                           |   5 +
>  4 files changed, 131 insertions(+), 87 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index 5ca9ec32b69..315d5ac4177 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3432,6 +3432,26 @@ is_ptrue (tree v, unsigned int step)
> 	  && vector_cst_all_same (v, step));
>  }
>  
> +/* Try to fold constant arguments arg1 and arg2 using the given tree_code.
> +   Operations are not treated as overflowing.  */
> +tree
> +aarch64_const_binop (enum tree_code code, tree arg1, tree arg2)
> +{
> +  if (poly_int_tree_p (arg1) && poly_int_tree_p (arg2))
> +    {
> +      poly_wide_int poly_res;
> +      tree type = TREE_TYPE (arg1);
> +      signop sign = TYPE_SIGN (type);
> +      wi::overflow_type overflow = wi::OVF_NONE;
> +
> +      if (!poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow))
> +	return NULL_TREE;
> +      return force_fit_type (type, poly_res, false,
> +			     TREE_OVERFLOW (arg1) | TREE_OVERFLOW (arg2));
> +    }
> +  return NULL_TREE;
> +}
> +
This is ok, but for staging purposes, I think it should be part of the
later patches that call it, with the function marked "static".

>  gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
> 			      gimple_stmt_iterator *gsi_in, gcall *call_in)
>    : function_call_info (gimple_location (call_in), instance, fndecl),
> @@ -1677,82 +1763,14 @@ const_binop (enum tree_code code, tree arg1, tree arg2)
>        && (simplified = simplify_const_binop (code, arg2, arg1, 1)))
>      return simplified;
>  
> -  if (TREE_CODE (arg1) == VECTOR_CST
> -      && TREE_CODE (arg2) == VECTOR_CST
> -      && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)),
> -		   TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg2))))
> -    {
> -      tree type = TREE_TYPE (arg1);
> -      bool step_ok_p;
> -      if (VECTOR_CST_STEPPED_P (arg1)
> -	  && VECTOR_CST_STEPPED_P (arg2))
> -	/* We can operate directly on the encoding if:
> -
> -	     a3 - a2 == a2 - a1 && b3 - b2 == b2 - b1
> -	   implies
> -	     (a3 op b3) - (a2 op b2) == (a2 op b2) - (a1 op b1)
> -
> -	   Addition and subtraction are the supported operators
> -	   for which this is true.  */
> -	step_ok_p = (code == PLUS_EXPR || code == MINUS_EXPR);
> -      else if (VECTOR_CST_STEPPED_P (arg1))
> -	/* We can operate directly on stepped encodings if:
> -
> -	     a3 - a2 == a2 - a1
> -	   implies:
> -	     (a3 op c) - (a2 op c) == (a2 op c) - (a1 op c)
> -
> -	   which is true if (x -> x op c) distributes over addition.  */
> -	step_ok_p = distributes_over_addition_p (code, 1);
> -      else
> -	/* Similarly in reverse.  */
> -	step_ok_p = distributes_over_addition_p (code, 2);
> -      tree_vector_builder elts;
> -      if (!elts.new_binary_operation (type, arg1, arg2, step_ok_p))
> -	return NULL_TREE;
> -      unsigned int count = elts.encoded_nelts ();
> -      for (unsigned int i = 0; i < count; ++i)
> -	{
> -	  tree elem1 = VECTOR_CST_ELT (arg1, i);
> -	  tree elem2 = VECTOR_CST_ELT (arg2, i);
> -
> -	  tree elt = const_binop (code, elem1, elem2);
> +  if ((TREE_CODE (arg1) == VECTOR_CST
> +       && TREE_CODE (arg2) == VECTOR_CST
> +       && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)),
> +		    TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg2))))
> +      || (TREE_CODE (arg1) == VECTOR_CST
> +	  && TREE_CODE (arg2) == INTEGER_CST))
> +    return vector_const_binop (code, arg1, arg2, const_binop);

Rather than do:

  if ((TREE_CODE (arg1) == VECTOR_CST
       && TREE_CODE (arg2) == VECTOR_CST
       && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)),
		    TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg2))))
      || (TREE_CODE (arg1) == VECTOR_CST
	  && TREE_CODE (arg2) == INTEGER_CST))
    return vector_const_binop (code, arg1, arg2, const_binop);

  return NULL_TREE;

I think we should just tail-call vector_const_binop unconditionally:

  return vector_const_binop (code, arg1, arg2, const_binop);

That should be more efficient, and also protects against any risk that
the conditions would get out of sync.

OK for the target-independent bits with that change, if there is no
objection by this time on Monday.

Thanks,
Richard

> 
> -	  /* It is possible that const_binop cannot handle the given
> -	     code and return NULL_TREE */
> -	  if (elt == NULL_TREE)
> -	    return NULL_TREE;
> -	  elts.quick_push (elt);
> -	}
> -
> -      return elts.build ();
> -    }
> -
> -  /* Shifts allow a scalar offset for a vector.  */
> -  if (TREE_CODE (arg1) == VECTOR_CST
> -      && TREE_CODE (arg2) == INTEGER_CST)
> -    {
> -      tree type = TREE_TYPE (arg1);
> -      bool step_ok_p = distributes_over_addition_p (code, 1);
> -      tree_vector_builder elts;
> -      if (!elts.new_unary_operation (type, arg1, step_ok_p))
> -	return NULL_TREE;
> -      unsigned int count = elts.encoded_nelts ();
> -      for (unsigned int i = 0; i < count; ++i)
> -	{
> -	  tree elem1 = VECTOR_CST_ELT (arg1, i);
> -
> -	  tree elt = const_binop (code, elem1, arg2);
> -
> -	  /* It is possible that const_binop cannot handle the given
> -	     code and return NULL_TREE.  */
> -	  if (elt == NULL_TREE)
> -	    return NULL_TREE;
> -	  elts.quick_push (elt);
> -	}
> -
> -      return elts.build ();
> -    }
>    return NULL_TREE;
>  }
>  
> diff --git a/gcc/fold-const.h b/gcc/fold-const.h
> index b82ef137e2f..3e3998b57b0 100644
> --- a/gcc/fold-const.h
> +++ b/gcc/fold-const.h
> @@ -126,6 +126,9 @@ extern tree fold_vec_perm (tree, tree, tree, const vec_perm_indices &);
>  extern bool wide_int_binop (wide_int &res, enum tree_code,
>  			    const wide_int &arg1, const wide_int &arg2,
>  			    signop, wi::overflow_type *);
> +extern bool poly_int_binop (poly_wide_int &res, enum tree_code,
> +			    const_tree, const_tree, signop,
> +			    wi::overflow_type *);
>  extern tree int_const_binop (enum tree_code, const_tree, const_tree, int = 1);
>  #define build_fold_addr_expr(T)\
>  	build_fold_addr_expr_loc (UNKNOWN_LOCATION, (T))
> @@ -218,6 +221,8 @@ extern bool simple_condition_p (tree);
>  extern tree exact_inverse (tree, tree);
>  extern bool expr_not_equal_to (tree t, const wide_int &);
>  extern tree const_unop (enum tree_code, tree, tree);
> +extern tree vector_const_binop (enum tree_code, tree, tree,
> +				tree (*) (enum tree_code, tree, tree));
>  extern tree const_binop (enum tree_code, tree, tree, tree);
>  extern bool negate_mathfn_p (combined_fn);
>  extern const char *getbyterep (tree, unsigned HOST_WIDE_INT *);
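
For anyone following along with the series: this patch only adds the folding
infrastructure.  The cover letter says aarch64_const_binop is passed as the
callback to vector_const_binop when gimple-folding intrinsic calls, but the
callers themselves only arrive in the later patches.  A minimal sketch of what
such a caller could look like is below.  It is illustrative only: svfoo_impl
and the choice of PLUS_EXPR are made up, f.call and f.lhs are assumed to be
the public gimple_folder members, and a real folder would also have to take
the governing predicate (argument 0) and the _m/_z/_x predication into
account.

  /* Sketch only -- not part of the posted patch.  Fold a hypothetical
     binary intrinsic call (pred, op1, op2) when op1 is a constant vector
     and op2 is a constant vector or, for the _n forms, an integer.  */
  gimple *
  svfoo_impl::fold (gimple_folder &f) const
  {
    /* Nothing to fold if the result of the call is unused.  */
    if (!f.lhs)
      return NULL;
    tree op1 = gimple_call_arg (f.call, 1);
    tree op2 = gimple_call_arg (f.call, 2);
    /* vector_const_binop checks the tree codes of op1/op2 itself and
       returns NULL_TREE when it cannot fold, so no extra guard is
       needed here.  */
    if (tree res = vector_const_binop (PLUS_EXPR, op1, op2,
                                       aarch64_const_binop))
      return gimple_build_assign (f.lhs, res);
    return NULL;
  }

The point of routing the element-wise folding through aarch64_const_binop
rather than int_const_binop is, as the cover letter says, that the operation
is not treated as overflowing.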