Jennifer Schmitz <jschm...@nvidia.com> writes:
> As follow-up to
> https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665472.html,
> this patch implements folding of svmul by -1 to svneg for
> unsigned SVE vector types.  The key idea is to reuse the existing code that
> does this fold for signed types and feed it as a callback to a helper
> function that adds the necessary type conversions.
>
> For example, for the test case
> svuint64_t foo (svuint64_t x, svbool_t pg)
> {
>   return svmul_n_u64_x (pg, x, -1);
> }
>
> the following gimple sequence is emitted (-O2 -mcpu=grace):
> svuint64_t foo (svuint64_t x, svbool_t pg)
> {
>   svint64_t D.12921;
>   svint64_t D.12920;
>   svuint64_t D.12919;
>
>   D.12920 = VIEW_CONVERT_EXPR<svint64_t>(x);
>   D.12921 = svneg_s64_x (pg, D.12920);
>   D.12919 = VIEW_CONVERT_EXPR<svuint64_t>(D.12921);
>   goto <D.12922>;
>   <D.12922>:
>   return D.12919;
> }
>
> In general, the new helper gimple_folder::convert_and_fold
> - takes a target type and a function pointer,
> - converts the lhs and all non-boolean vector types to the target type,
> - passes the converted lhs and arguments to the callback,
> - receives the new gimple statement from the callback function,
> - adds the necessary view converts to the gimple sequence,
> - and returns the new call.
>
> Because all arguments are converted to the same target type, the helper
> function is only suitable for folding calls whose arguments are all of
> the same type.  If necessary, this could be extended to convert the
> arguments to different types individually.
>
> The patch was bootstrapped and tested on aarch64-linux-gnu, with no
> regressions.  OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-sve-builtins-base.cc
> 	(svmul_impl::fold): Wrap code for folding to svneg in a lambda
> 	function and pass it to gimple_folder::convert_and_fold to enable
> 	the transform for unsigned types.
> 	* config/aarch64/aarch64-sve-builtins.cc
> 	(gimple_folder::convert_and_fold): New function that converts
> 	operands to the target type before calling the callback function,
> 	adding the necessary conversion statements.
> 	* config/aarch64/aarch64-sve-builtins.h
> 	(gimple_folder::convert_and_fold): Declare function.
> 	(signed_type_suffix_index): Return type_suffix_index of signed
> 	vector type for given width.
> 	(function_instance::signed_type): Return signed vector type for
> 	given width.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/acle/asm/mul_u8.c: Adjust expected outcome.
> 	* gcc.target/aarch64/sve/acle/asm/mul_u16.c: Likewise.
> 	* gcc.target/aarch64/sve/acle/asm/mul_u32.c: Likewise.
> 	* gcc.target/aarch64/sve/acle/asm/mul_u64.c: New test and adjust
> 	expected outcome.
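Just to check that I'm reading the new interface correctly, based on the
description above: a fold routine would use the hook roughly as in the
sketch below.  (Illustrative only, not code from the patch; "copy_op1" is a
made-up callback that simply copies the converted first vector argument,
and "f" is the gimple_folder, as in svmul_impl::fold below.)

  auto copy_op1 = [](gimple_folder &, tree lhs_conv,
                     vec<tree> &args_conv) -> gimple *
    {
      /* Build only the core replacement statement here;
         convert_and_fold supplies the VIEW_CONVERT_EXPRs on either
         side of it.  */
      return gimple_build_assign (lhs_conv, args_conv[1]);
    };
  tree ty = f.signed_type (f.type_suffix (0).element_bits);
  return f.convert_and_fold (ty, copy_op1);
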
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc      | 70 +++++++++++++------
>  gcc/config/aarch64/aarch64-sve-builtins.cc    | 43 ++++++++++++
>  gcc/config/aarch64/aarch64-sve-builtins.h     | 31 ++++++++
>  .../gcc.target/aarch64/sve/acle/asm/mul_u16.c |  5 +-
>  .../gcc.target/aarch64/sve/acle/asm/mul_u32.c |  5 +-
>  .../gcc.target/aarch64/sve/acle/asm/mul_u64.c | 26 ++++++-
>  .../gcc.target/aarch64/sve/acle/asm/mul_u8.c  |  7 +-
>  7 files changed, 153 insertions(+), 34 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 87e9909b55a..52401a8c57a 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -2092,33 +2092,61 @@ public:
>        return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
>
>      /* If one of the operands is all integer -1, fold to svneg.  */
> -    tree pg = gimple_call_arg (f.call, 0);
> -    tree negated_op = NULL;
> -    if (integer_minus_onep (op2))
> -      negated_op = op1;
> -    else if (integer_minus_onep (op1))
> -      negated_op = op2;
> -    if (!f.type_suffix (0).unsigned_p && negated_op)
> +    if (integer_minus_onep (op1) || integer_minus_onep (op2))
>        {
> -        function_instance instance ("svneg", functions::svneg,
> -                                    shapes::unary, MODE_none,
> -                                    f.type_suffix_ids, GROUP_none, f.pred);
> -        gcall *call = f.redirect_call (instance);
> -        unsigned offset_index = 0;
> -        if (f.pred == PRED_m)
> +        auto mul_by_m1 = [](gimple_folder &f, tree lhs_conv,
> +                            vec<tree> &args_conv) -> gimple *
>            {
> -            offset_index = 1;
> -            gimple_call_set_arg (call, 0, op1);
> -          }
> -        else
> -          gimple_set_num_ops (call, 5);
> -        gimple_call_set_arg (call, offset_index, pg);
> -        gimple_call_set_arg (call, offset_index + 1, negated_op);
> -        return call;
> +            gcc_assert (lhs_conv && args_conv.length () == 3);
> +            tree pg = args_conv[0];
> +            tree op1 = args_conv[1];
> +            tree op2 = args_conv[2];
> +            tree negated_op = op1;
> +            bool negate_op1 = true;
> +            if (integer_minus_onep (op1))
> +              {
> +                negated_op = op2;
> +                negate_op1 = false;
> +              }
> +            type_suffix_pair signed_tsp =
> +              {signed_type_suffix_index (f.type_suffix (0).element_bits),
> +               f.type_suffix_ids[1]};
> +            function_instance instance ("svneg", functions::svneg,
> +                                        shapes::unary, MODE_none,
> +                                        signed_tsp, GROUP_none, f.pred);
> +            gcall *call = f.redirect_call (instance);
> +            gimple_call_set_lhs (call, lhs_conv);
> +            unsigned offset = 0;
> +            tree fntype, op1_type = TREE_TYPE (op1);
> +            if (f.pred == PRED_m)
> +              {
> +                offset = 1;
> +                tree arg_types[3] = {op1_type, TREE_TYPE (pg), op1_type};
> +                fntype = build_function_type_array (TREE_TYPE (lhs_conv),
> +                                                    3, arg_types);
> +                tree ty = f.signed_type (f.type_suffix (0).element_bits);
> +                tree inactive = negate_op1 ? op1 : build_minus_one_cst (ty);
I think my question from the previous review still stands: Isn't the old
approach of using op1 as the inactive argument still correct?

> +                gimple_call_set_arg (call, 0, inactive);
> +              }
> +            else
> +              {
> +                gimple_set_num_ops (call, 5);
> +                tree arg_types[2] = {TREE_TYPE (pg), op1_type};
> +                fntype = build_function_type_array (TREE_TYPE (lhs_conv),
> +                                                    2, arg_types);
> +              }
> +            gimple_call_set_fntype (call, fntype);

I think the fntype should be set by redirect_call, which can get the type
from TREE_TYPE (rfn->decl).  That's better because it includes any
relevant attributes.

> +            gimple_call_set_arg (call, offset, pg);
> +            gimple_call_set_arg (call, offset + 1, negated_op);
> +            return call;
> +          };
> +        tree ty = f.signed_type (f.type_suffix (0).element_bits);
> +        return f.convert_and_fold (ty, mul_by_m1);
>        }
>
>      /* If one of the operands is a uniform power of 2, fold to a left shift
>         by immediate.  */
> +    tree pg = gimple_call_arg (f.call, 0);
>      tree op1_cst = uniform_integer_cst_p (op1);
>      tree op2_cst = uniform_integer_cst_p (op2);
>      tree shift_op1, shift_op2 = NULL;
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index 0fec1cd439e..01b0da22c9b 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3646,6 +3646,49 @@ gimple_folder::redirect_pred_x ()
>    return redirect_call (instance);
>  }
>
> +/* Convert the lhs and all non-boolean vector-type operands to TYPE.
> +   Pass the converted variables to the callback FP, and finally convert the
> +   result back to the original type.  Add the necessary conversion statements.
> +   Return the new call.  */
> +gimple *
> +gimple_folder::convert_and_fold (tree type,
> +                                 gimple *(*fp) (gimple_folder &,
> +                                                tree, vec<tree> &))
> +{
> +  gcc_assert (VECTOR_TYPE_P (type)
> +              && TYPE_MODE (type) != VNx16BImode);
> +  tree old_ty = TREE_TYPE (lhs);
> +  gimple_seq stmts = NULL;
> +  tree lhs_conv, op, op_ty, t;
> +  gimple *g, *new_stmt;
> +  bool convert_lhs_p = !useless_type_conversion_p (type, old_ty);
> +  lhs_conv = convert_lhs_p ? create_tmp_var (type) : lhs;
> +  unsigned int num_args = gimple_call_num_args (call);
> +  vec<tree> args_conv = vNULL;

It'd be slightly more efficient to use something like:

  auto_vec<tree, 16> args_conv;

here.  This also removes the need to free the vector after use
(looks like the current version would leak).

> +  args_conv.safe_grow (num_args);
> +  for (unsigned int i = 0; i < num_args; ++i)
> +    {
> +      op = gimple_call_arg (call, i);
> +      op_ty = TREE_TYPE (op);
> +      args_conv[i] =
> +        (VECTOR_TYPE_P (op_ty)
> +         && TREE_CODE (op) != VECTOR_CST
> +         && TYPE_MODE (op_ty) != VNx16BImode
> +         && !useless_type_conversion_p (op_ty, type))

I think we should convert VECTOR_CSTs too.  gimple_build should fold it
for us (but let me know if it doesn't).

Thanks, and sorry for the slow review.

Richard

> +        ? gimple_build (&stmts, VIEW_CONVERT_EXPR, type, op) : op;
> +    }
> +
> +  new_stmt = fp (*this, lhs_conv, args_conv);
> +  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +  if (convert_lhs_p)
> +    {
> +      t = build1 (VIEW_CONVERT_EXPR, old_ty, lhs_conv);
> +      g = gimple_build_assign (lhs, VIEW_CONVERT_EXPR, t);
> +      gsi_insert_after (gsi, g, GSI_SAME_STMT);
> +    }
> +  return new_stmt;
> +}
> +
>  /* Fold the call to constant VAL.  */
>  gimple *
>  gimple_folder::fold_to_cstu (poly_uint64 val)
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
> index 4094f8207f9..3e919e52e2c 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.h
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.h
> @@ -406,6 +406,7 @@ public:
>    tree scalar_type (unsigned int) const;
>    tree vector_type (unsigned int) const;
>    tree tuple_type (unsigned int) const;
> +  tree signed_type (unsigned int) const;
>    unsigned int elements_per_vq (unsigned int) const;
>    machine_mode vector_mode (unsigned int) const;
>    machine_mode tuple_mode (unsigned int) const;
> @@ -632,6 +633,8 @@ public:
>
>    gcall *redirect_call (const function_instance &);
>    gimple *redirect_pred_x ();
> +  gimple *convert_and_fold (tree, gimple *(*) (gimple_folder &,
> +                                               tree, vec<tree> &));
>
>    gimple *fold_to_cstu (poly_uint64);
>    gimple *fold_to_pfalse ();
> @@ -864,6 +867,20 @@ find_type_suffix (type_class_index tclass, unsigned int element_bits)
>    gcc_unreachable ();
>  }
>
> +/* Return the type suffix of the signed type of width ELEMENT_BITS.  */
> +inline type_suffix_index
> +signed_type_suffix_index (unsigned int element_bits)
> +{
> +  switch (element_bits)
> +    {
> +    case 8: return TYPE_SUFFIX_s8;
> +    case 16: return TYPE_SUFFIX_s16;
> +    case 32: return TYPE_SUFFIX_s32;
> +    case 64: return TYPE_SUFFIX_s64;
> +    }
> +  gcc_unreachable ();
> +}
> +
>  /* Return the single field in tuple type TYPE.  */
>  inline tree
>  tuple_type_field (tree type)
> @@ -1029,6 +1046,20 @@ function_instance::tuple_type (unsigned int i) const
>    return acle_vector_types[num_vectors - 1][type_suffix (i).vector_type];
>  }
>
> +/* Return the signed vector type of width ELEMENT_BITS.  */
> +inline tree
> +function_instance::signed_type (unsigned int element_bits) const
> +{
> +  switch (element_bits)
> +    {
> +    case 8: return acle_vector_types[0][VECTOR_TYPE_svint8_t];
> +    case 16: return acle_vector_types[0][VECTOR_TYPE_svint16_t];
> +    case 32: return acle_vector_types[0][VECTOR_TYPE_svint32_t];
> +    case 64: return acle_vector_types[0][VECTOR_TYPE_svint64_t];
> +    }
> +  gcc_unreachable ();
> +}
> +
>  /* Return the number of elements of type suffix I that fit within a
>     128-bit block.  */
>  inline unsigned int
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c
> index bdf6fcb98d6..e228dc5995d 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c
> @@ -174,8 +174,7 @@ TEST_UNIFORM_Z (mul_3_u16_m_untied, svuint16_t,
>
>  /*
>  ** mul_m1_u16_m:
> -** mov (z[0-9]+)\.b, #-1
> -** mul z0\.h, p0/m, z0\.h, \1\.h
> +** neg z0\.h, p0/m, z0\.h
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_m1_u16_m, svuint16_t,
> @@ -569,7 +568,7 @@ TEST_UNIFORM_Z (mul_255_u16_x, svuint16_t,
>
>  /*
>  ** mul_m1_u16_x:
> -** mul z0\.h, z0\.h, #-1
> +** neg z0\.h, p0/m, z0\.h
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_m1_u16_x, svuint16_t,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c
> index a61e85fa12d..e8f52c9d785 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c
> @@ -174,8 +174,7 @@ TEST_UNIFORM_Z (mul_3_u32_m_untied, svuint32_t,
>
>  /*
>  ** mul_m1_u32_m:
> -** mov (z[0-9]+)\.b, #-1
> -** mul z0\.s, p0/m, z0\.s, \1\.s
> +** neg z0\.s, p0/m, z0\.s
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_m1_u32_m, svuint32_t,
> @@ -569,7 +568,7 @@ TEST_UNIFORM_Z (mul_255_u32_x, svuint32_t,
>
>  /*
>  ** mul_m1_u32_x:
> -** mul z0\.s, z0\.s, #-1
> +** neg z0\.s, p0/m, z0\.s
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_m1_u32_x, svuint32_t,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c
> index eee1f8a0c99..2ccdc3642c5 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c
> @@ -183,14 +183,25 @@ TEST_UNIFORM_Z (mul_3_u64_m_untied, svuint64_t,
>
>  /*
>  ** mul_m1_u64_m:
> -** mov (z[0-9]+)\.b, #-1
> -** mul z0\.d, p0/m, z0\.d, \1\.d
> +** neg z0\.d, p0/m, z0\.d
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_m1_u64_m, svuint64_t,
>                  z0 = svmul_n_u64_m (p0, z0, -1),
>                  z0 = svmul_m (p0, z0, -1))
>
> +/*
> +** mul_m1r_u64_m:
> +** mov (z[0-9]+)\.b, #-1
> +** mov (z[0-9]+\.d), z0\.d
> +** movprfx z0, \1
> +** neg z0\.d, p0/m, \2
> +** ret
> +*/
> +TEST_UNIFORM_Z (mul_m1r_u64_m, svuint64_t,
> +                z0 = svmul_u64_m (p0, svdup_u64 (-1), z0),
> +                z0 = svmul_m (p0, svdup_u64 (-1), z0))
> +
>  /*
>  ** mul_u64_z_tied1:
>  ** movprfx z0\.d, p0/z, z0\.d
> @@ -597,13 +608,22 @@ TEST_UNIFORM_Z (mul_255_u64_x, svuint64_t,
>
>  /*
>  ** mul_m1_u64_x:
> -** mul z0\.d, z0\.d, #-1
> +** neg z0\.d, p0/m, z0\.d
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_m1_u64_x, svuint64_t,
>                  z0 = svmul_n_u64_x (p0, z0, -1),
>                  z0 = svmul_x (p0, z0, -1))
>
> +/*
> +** mul_m1r_u64_x:
> +** neg z0\.d, p0/m, z0\.d
> +** ret
> +*/
> +TEST_UNIFORM_Z (mul_m1r_u64_x, svuint64_t,
> +                z0 = svmul_u64_x (p0, svdup_u64 (-1), z0),
> +                z0 = svmul_x (p0, svdup_u64 (-1), z0))
> +
>  /*
>  ** mul_m127_u64_x:
>  ** mul z0\.d, z0\.d, #-127
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c
> index 06ee1b3e7c8..8e53a4821f0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c
> @@ -174,8 +174,7 @@ TEST_UNIFORM_Z (mul_3_u8_m_untied, svuint8_t,
>
>  /*
>  ** mul_m1_u8_m:
> -** mov (z[0-9]+)\.b, #-1
> -** mul z0\.b, p0/m, z0\.b, \1\.b
> +** neg z0\.b, p0/m, z0\.b
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_m1_u8_m, svuint8_t,
> @@ -559,7 +558,7 @@ TEST_UNIFORM_Z (mul_128_u8_x, svuint8_t,
>
>  /*
>  ** mul_255_u8_x:
> -** mul z0\.b, z0\.b, #-1
> +** neg z0\.b, p0/m, z0\.b
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_255_u8_x, svuint8_t,
> @@ -568,7 +567,7 @@ TEST_UNIFORM_Z (mul_255_u8_x, svuint8_t,
>
>  /*
>  ** mul_m1_u8_x:
> -** mul z0\.b, z0\.b, #-1
> +** neg z0\.b, p0/m, z0\.b
>  ** ret
>  */
>  TEST_UNIFORM_Z (mul_m1_u8_x, svuint8_t,
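
Coming back to the fntype point above: what I mean is roughly the following
(untested sketch), next to the existing gimple_call_set_fndecl call in
gimple_folder::redirect_call, where rfn is the registered_function that
redirect_call already looks up:

  gimple_call_set_fndecl (call, rfn->decl);
  gimple_call_set_fntype (call, TREE_TYPE (rfn->decl));

That way the redirected call's fntype always matches the decl it now refers
to (attributes included), and mul_by_m1 wouldn't need to build a function
type by hand.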