The loop vectorizer can generate vector permutes with constant indexes in which all indexes are equal. Optimize this case by using a vector replicate instead of a vector permute.
gcc/ChangeLog: * config/s390/s390.cc (expand_perm_as_replicate): Implement. (vectorize_vec_perm_const_1): Call new function. * config/s390/vx-builtins.md (vec_splat<mode>): Change to... (@vec_splat<mode>): ...this. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-expand-replicate.c: New test. Bootstrapped and regtested on s390x. Ok for trunk? Signed-off-by: Juergen Christ <jchr...@linux.ibm.com> --- gcc/config/s390/s390.cc | 33 ++++++++++ gcc/config/s390/vx-builtins.md | 2 +- .../s390/vector/vec-expand-replicate.c | 60 +++++++++++++++++++ 3 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 372a23244032..3148f163627c 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -17923,6 +17923,36 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct expand_vec_perm_d &d) return false; } +static bool +expand_perm_as_replicate (const struct expand_vec_perm_d &d) +{ + unsigned char i; + unsigned char elem; + rtx base = d.op0; + rtx insn; + /* Needed to silence maybe-uninitialized warning. */ + gcc_assert (d.nelt > 0); + elem = d.perm[0]; + for (i = 1; i < d.nelt; ++i) + if (d.perm[i] != elem) + return false; + if (!d.testing_p) + { + if (elem >= d.nelt) + { + base = d.op1; + elem -= d.nelt; + } + insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem)); + if (insn == NULL_RTX) + return false; + emit_insn (insn); + return true; + } + else + return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing; +} + /* Try to find the best sequence for the vector permute operation described by D. Return true if the operation could be expanded. 
*/ @@ -17941,6 +17971,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) if (expand_perm_as_a_vlbr_vstbr_candidate (d)) return true; + if (expand_perm_as_replicate (d)) + return true; + return false; } diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 432d81a719fc..93c0d408a43e 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -424,7 +424,7 @@ ; Replicate from vector element -(define_expand "vec_splat<mode>" +(define_expand "@vec_splat<mode>" [(set (match_operand:V_HW 0 "register_operand" "") (vec_duplicate:V_HW (vec_select:<non_vec> (match_operand:V_HW 1 "register_operand" "") diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c new file mode 100644 index 000000000000..872b1c9321cd --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c @@ -0,0 +1,60 @@ +/* Check that the vectorize_vec_perm_const expander correctly deals with + replication. Extracted from spec "nab". 
*/ + +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */ + +typedef double POINT_T[3]; +typedef double MATRIX_T[][4]; +typedef struct { + POINT_T a_pos; +} ATOM_T; +typedef struct { + ATOM_T *r_atoms; +} RESIDUE_T; +typedef struct strand_t { + RESIDUE_T *s_residues; +} STRAND_T; +typedef struct strand_t MOLECULE_T; +double xfm_xyz_oxyz4[4]; +MOLECULE_T add_he2o3transformmol_mol, add_he2o3transformmol_sp; +RESIDUE_T add_he2o3transformmol_res; +int add_he2o3transformmol_r, add_he2o3transformmol_a, add_he2o3transformmol_i; +ATOM_T *add_he2o3transformmol_ap; +POINT_T add_he2o3transformmol_xyz, add_he2o3transformmol_nxyz; +static void xfm_xyz(POINT_T oxyz, MATRIX_T mat, POINT_T nxyz) { + int i, j; + double nxyz4[4]; + for (i = 0; i < 3; i++) + xfm_xyz_oxyz4[i] = oxyz[i]; + xfm_xyz_oxyz4[3] = 1.0; + for (i = 0; i < 4; i++) { + nxyz4[i] = 0.0; + for (j = 0; j < 4; j++) + nxyz4[i] += xfm_xyz_oxyz4[j] * mat[j][i]; + } + for (i = 0; i < 3; i++) + nxyz[i] = nxyz4[i]; +} +void add_he2o3transformmol(MATRIX_T mat, int n) { + for (add_he2o3transformmol_sp = add_he2o3transformmol_mol;;) + for (add_he2o3transformmol_r = 0;;) { + add_he2o3transformmol_res = + add_he2o3transformmol_sp.s_residues[add_he2o3transformmol_r]; + for (add_he2o3transformmol_a = 0; add_he2o3transformmol_a < n; add_he2o3transformmol_a++) { + add_he2o3transformmol_ap = + &add_he2o3transformmol_res.r_atoms[add_he2o3transformmol_a]; + for (add_he2o3transformmol_i = 0; add_he2o3transformmol_i < 3; + add_he2o3transformmol_i++) + add_he2o3transformmol_xyz[add_he2o3transformmol_i] = + add_he2o3transformmol_ap->a_pos[add_he2o3transformmol_i]; + xfm_xyz(add_he2o3transformmol_xyz, mat, add_he2o3transformmol_nxyz); + for (add_he2o3transformmol_i = 0; add_he2o3transformmol_i < 3; + add_he2o3transformmol_i++) + add_he2o3transformmol_ap->a_pos[add_he2o3transformmol_i] = + add_he2o3transformmol_nxyz[add_he2o3transformmol_i]; + } + } +} + +/* { dg-final { scan-assembler-not 
"vperm" } } */ -- 2.39.3