On Tue, Apr 02, 2024 at 09:56:01AM +0200, Juergen Christ wrote: > Loop vectorizer can generate vector permutes with constant indexes > where all indexes are equal. Optimize this case to use vector > replicate instead of vector permute. > > gcc/ChangeLog: > > * config/s390/s390.cc (expand_perm_as_replicate): Implement. > (vectorize_vec_perm_const_1): Call new function. > * config/s390/vx-builtins.md (vec_splat<mode>): Change to... > (@vec_splat<mode>): ...this. > > gcc/testsuite/ChangeLog: > > * gcc.target/s390/vector/vec-expand-replicate.c: New test. > > Bootstrapped and regtested on s390x. Ok for trunk? > > Signed-off-by: Juergen Christ <jchr...@linux.ibm.com> > --- > gcc/config/s390/s390.cc | 32 +++++++++++++++++++ > gcc/config/s390/vx-builtins.md | 2 +- > .../s390/vector/vec-expand-replicate.c | 30 +++++++++++++++++ > 3 files changed, 63 insertions(+), 1 deletion(-) > create mode 100644 > gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > > diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc > index 372a23244032..4b4014ebe444 100644 > --- a/gcc/config/s390/s390.cc > +++ b/gcc/config/s390/s390.cc > @@ -17923,6 +17923,35 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct > expand_vec_perm_d &d) > return false; > } > > +static bool expand_perm_as_replicate (const struct expand_vec_perm_d &d) ^~~~~~~~~~~~~~~~~~~~~~~~ Function names start on a new line.
> +{ > + unsigned char i; > + unsigned char elem; > + rtx base = d.op0; > + rtx insn; > + /* Needed to silence maybe-uninitialized warning. */ > + gcc_assert(d.nelt > 0); ~~~~~~~~~~^~~~~~~~~~~~ Between function name and open bracket whitespace is missing. Curiously enough, the error is about d which is a reference and cannot be null. If you are eager you could reduce this and open a PR. s390.cc:17935:8: warning: ādā may be used uninitialized [-Wmaybe-uninitialized] 17935 | elem = d.perm[0]; | ~~~~~^~~~~~~~~~~ > + elem = d.perm[0]; > + for (i = 1; i < d.nelt; ++i) > + if (d.perm[i] != elem) > + return false; > + if (!d.testing_p) > + { > + if (elem >= d.nelt) > + { > + base = d.op1; > + elem -= d.nelt; > + } > + insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem)); > + if (insn == NULL_RTX) > + return false; > + emit_insn (insn); > + return true; > + } > + else > + return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing; > +} > + > /* Try to find the best sequence for the vector permute operation > described by D. Return true if the operation could be > expanded. */ > @@ -17941,6 +17970,9 @@ vectorize_vec_perm_const_1 (const struct > expand_vec_perm_d &d) > if (expand_perm_as_a_vlbr_vstbr_candidate (d)) > return true; > > + if (expand_perm_as_replicate(d)) ~~~~~~~~~~~~~~~~~~~~~~~~^~~ Between function name and open bracket whitespace is missing. > + return true; > + > return false; > } > > diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md > index 432d81a719fc..93c0d408a43e 100644 > --- a/gcc/config/s390/vx-builtins.md > +++ b/gcc/config/s390/vx-builtins.md > @@ -424,7 +424,7 @@ > > > ; Replicate from vector element > -(define_expand "vec_splat<mode>" > +(define_expand "@vec_splat<mode>" > [(set (match_operand:V_HW 0 "register_operand" "") > (vec_duplicate:V_HW (vec_select:<non_vec> > (match_operand:V_HW 1 "register_operand" "") > diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > new file mode 100644 > index 000000000000..27563a00f22b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > @@ -0,0 +1,30 @@ > +/* Check that the vectorize_vec_perm_const expander correctly deals with > + replication. Extracted from spec "nab". */ > + > +/* { dg-do compile } */ > +/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */ > + > + > +#define REAL_T double > +typedef REAL_T MATRIX_T[ 4 ][ 4 ]; > + > +int concat_mat_i, concat_mat_j; > +static void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3); > +MATRIX_T *rot4p() { > + MATRIX_T mat3, mat4; > + static MATRIX_T mat5; > + concat_mat(mat4, mat3, mat5); > +} > +void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3) { > + int k; > + for (;; concat_mat_i++) { > + concat_mat_j = 0; > + for (; 4; concat_mat_j++) { > + k = 0; > + for (; k < 4; k++) > + m3[concat_mat_i][concat_mat_j] += m1[concat_mat_i][k]; > + } Just nitpicking, if we could come up with a test case which does not involve integer overflows due to non-terminating loops, I would prefer that. Cheers, Stefan > + } > +} > + > +/* { dg-final { scan-assembler-not "vperm" } } */ > -- > 2.39.3 >