[PATCH] s390: fix htm-builtins test cases
Transactional and non-transactional stores to the same cache line cause transactions to abort on newer generations. Add sufficient padding to make sure another cache line is used. Tested on s390. gcc/testsuite/ChangeLog: * gcc.target/s390/htm-builtins-1.c: Fix. * gcc.target/s390/htm-builtins-2.c: Fix. Signed-off-by: Juergen Christ --- gcc/testsuite/gcc.target/s390/htm-builtins-1.c | 4 +++- gcc/testsuite/gcc.target/s390/htm-builtins-2.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.target/s390/htm-builtins-1.c b/gcc/testsuite/gcc.target/s390/htm-builtins-1.c index ff43be9fe736..4f95bf3accaa 100644 --- a/gcc/testsuite/gcc.target/s390/htm-builtins-1.c +++ b/gcc/testsuite/gcc.target/s390/htm-builtins-1.c @@ -53,9 +53,11 @@ __attribute__ ((aligned(256))) struct __attribute__ ((aligned(256))) struct { volatile uint64_t c1; + char pad1[256 - sizeof(uint64_t)]; volatile uint64_t c2; + char pad2[256 - sizeof(uint64_t)]; volatile uint64_t c3; -} counters = { 0, 0, 0 }; +} counters = { 0 }; /* local helper functions - */ diff --git a/gcc/testsuite/gcc.target/s390/htm-builtins-2.c b/gcc/testsuite/gcc.target/s390/htm-builtins-2.c index bb9d346ea560..2e838caacc8c 100644 --- a/gcc/testsuite/gcc.target/s390/htm-builtins-2.c +++ b/gcc/testsuite/gcc.target/s390/htm-builtins-2.c @@ -94,9 +94,11 @@ float global_float_3 = 0.0; __attribute__ ((aligned(256))) struct { volatile uint64_t c1; + char pad1[256 - sizeof(uint64_t)]; volatile uint64_t c2; + char pad2[256 - sizeof(uint64_t)]; volatile uint64_t c3; -} counters = { 0, 0, 0 }; +} counters = { 0 }; /* local helper functions - */ -- 2.39.3
[PATCH] s390x: Implement vector cost model
Hi, s390x used the basic cost model which does not correctly model the cost of register file crossing or the availability of certain instructions to simplify reversed operations. Implement a dedicated cost model to better control when to vectorize. gcc/ChangeLog: * config/s390/s390.cc (class s390_vector_costs): Implement. (s390_vector_costs::s390_vector_costs): Ditto. (s390_vector_costs::add_stmt_cost): Ditto. (s390_vectorize_create_costs): Ditto. (TARGET_VECTORIZE_CREATE_COSTS): Ditto. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/loop-1.c: New test. * gcc.target/s390/vector/slp-1.c: New test. * gcc.target/s390/vector/slp-2.c: New test. Signed-off-by: Juergen Christ Bootstrapped and tested on s390x. Ok for master? --- gcc/config/s390/s390.cc | 127 ++ gcc/testsuite/gcc.target/s390/vector/loop-1.c | 82 +++ gcc/testsuite/gcc.target/s390/vector/slp-1.c | 68 ++ gcc/testsuite/gcc.target/s390/vector/slp-2.c | 31 + 4 files changed, 308 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/vector/loop-1.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/slp-1.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/slp-2.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 372a23244032..b9dab1cf8a85 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -88,6 +88,7 @@ along with GCC; see the file COPYING3. If not see #include "ipa-prop.h" #include "ipa-fnsummary.h" #include "sched-int.h" +#include "tree-vectorizer.h" /* This file should be included last. 
*/ #include "target-def.h" @@ -4199,6 +4200,130 @@ s390_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } +/* s390-specific vector costs */ +class s390_vector_costs : public vector_costs +{ + stmt_vec_info skipfinalpart; +public: + s390_vector_costs (vec_info *, bool); + + unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind, + stmt_vec_info stmt_info, slp_tree node, + tree vectype, int misalign, + vect_cost_model_location where) override; +}; + +s390_vector_costs::s390_vector_costs(vec_info *vinfo, bool costing_for_scalar) + : vector_costs(vinfo, costing_for_scalar) +{ +} + +unsigned int +s390_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, + stmt_vec_info stmt_info, slp_tree node, + tree vectype, int misalign, + vect_cost_model_location where) +{ + bool fp = false; + int costs = s390_builtin_vectorization_cost (kind, vectype, misalign); + + if (vectype != NULL) +fp = FLOAT_TYPE_P (vectype); + + if ((kind == scalar_to_vec || kind == vec_construct) + && node + && SLP_TREE_DEF_TYPE (node) == vect_external_def) +{ + unsigned int i; + tree op; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + if (TREE_CODE (op) == SSA_NAME) + TREE_VISITED (op) = 0; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + { + if (TREE_CODE (op) != SSA_NAME + || TREE_VISITED (op)) + continue; + TREE_VISITED (op) = 1; + gimple *def = SSA_NAME_DEF_STMT (op); + tree temp; + if (is_gimple_assign(def) + && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)) + && (temp = gimple_assign_rhs1(def)) + && TREE_CODE (temp) == SSA_NAME + && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)), + TREE_TYPE (temp))) + def = SSA_NAME_DEF_STMT (temp); + if (!gimple_assign_load_p (def)) + { + /* For scalar_to_vec from a fp register, we might not +cross the register files. So keep the penalty small. +??? If we have to cross, we actually cross twice +leading to a huge runtime penalty. Should we reflect +this here? 
*/ + if (kind == scalar_to_vec && fp) + costs += 2; + else + costs += 3; + } + } + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + if (TREE_CODE (op) == SSA_NAME) + TREE_VISITED (op) = 0; +} + if (kind == scalar_stmt && stmt_info && is_gimple_assign (stmt_info->stmt)) +{ + const gassign *assign = dyn_cast (stmt_info->stmt); + tree comptype = NULL_TREE; + if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR) + comptype = TREE_TYPE (gimple_assign_rhs1 (assign)); + if (gimple_assign_rhs_code (assign) == BIT_FIELD_REF) + comptype = TREE_TYPE (TREE_OPERAND (gimple_assign_rhs1 (ass
[PATCH] s390x: Optimize vector permute with constant indexes
Loop vectorizer can generate vector permutes with constant indexes where all indexes are equal. Optimize this case to use vector replicate instead of vector permute. gcc/ChangeLog: * config/s390/s390.cc (expand_perm_as_replicate): Implement. (vectorize_vec_perm_const_1): Call new function. * config/s390/vx-builtins.md (vec_splat): Change to... (@vec_splat): ...this. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-expand-replicate.c: New test. Bootstrapped and regtested on s390x. Ok for trunk? Signed-off-by: Juergen Christ --- gcc/config/s390/s390.cc | 32 +++ gcc/config/s390/vx-builtins.md| 2 +- .../s390/vector/vec-expand-replicate.c| 30 + 3 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 372a23244032..4b4014ebe444 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -17923,6 +17923,35 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct expand_vec_perm_d &d) return false; } +static bool expand_perm_as_replicate (const struct expand_vec_perm_d &d) +{ + unsigned char i; + unsigned char elem; + rtx base = d.op0; + rtx insn; + /* Needed to silence maybe-uninitialized warning. */ + gcc_assert(d.nelt > 0); + elem = d.perm[0]; + for (i = 1; i < d.nelt; ++i) +if (d.perm[i] != elem) + return false; + if (!d.testing_p) +{ + if (elem >= d.nelt) + { + base = d.op1; + elem -= d.nelt; + } + insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem)); + if (insn == NULL_RTX) + return false; + emit_insn (insn); + return true; +} + else +return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing; +} + /* Try to find the best sequence for the vector permute operation described by D. Return true if the operation could be expanded. 
*/ @@ -17941,6 +17970,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) if (expand_perm_as_a_vlbr_vstbr_candidate (d)) return true; + if (expand_perm_as_replicate(d)) +return true; + return false; } diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 432d81a719fc..93c0d408a43e 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -424,7 +424,7 @@ ; Replicate from vector element -(define_expand "vec_splat" +(define_expand "@vec_splat" [(set (match_operand:V_HW 0 "register_operand" "") (vec_duplicate:V_HW (vec_select: (match_operand:V_HW 1 "register_operand" "") diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c new file mode 100644 index ..27563a00f22b --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c @@ -0,0 +1,30 @@ +/* Check that the vectorize_vec_perm_const expander correctly deals with + replication. Extracted from spec "nab". */ + +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */ + + +#define REAL_T double +typedef REAL_T MATRIX_T[ 4 ][ 4 ]; + +int concat_mat_i, concat_mat_j; +static void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3); +MATRIX_T *rot4p() { + MATRIX_T mat3, mat4; + static MATRIX_T mat5; + concat_mat(mat4, mat3, mat5); +} +void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3) { + int k; + for (;; concat_mat_i++) { +concat_mat_j = 0; +for (; 4; concat_mat_j++) { + k = 0; + for (; k < 4; k++) +m3[concat_mat_i][concat_mat_j] += m1[concat_mat_i][k]; +} + } +} + +/* { dg-final { scan-assembler-not "vperm" } } */ -- 2.39.3
Re: [PATCH] s390x: Optimize vector permute with constant indexes
Am Tue, Apr 09, 2024 at 11:51:00AM +0200 schrieb Stefan Schulze Frielinghaus: > > +static bool expand_perm_as_replicate (const struct expand_vec_perm_d &d) >^~~~ > Function names start on a new line. Fixed > > +{ > > + unsigned char i; > > + unsigned char elem; > > + rtx base = d.op0; > > + rtx insn; > > + /* Needed to silence maybe-uninitialized warning. */ > > + gcc_assert(d.nelt > 0); > ~~^~~~ > Between function name and open bracket whitespace is missing. Fixed. > Curiously enough, the error is about d which is a reference and cannot > be null. If you are eager you could reduce this and open a PR. > > s390.cc:17935:8: warning: ‘d’ may be used uninitialized > [-Wmaybe-uninitialized] > 17935 | elem = d.perm[0]; > | ~^~~ Weirdly enough it is not `d`, but `d.perm[0]` that seems to be the problem. But I did not reduce this. As the assertion suggests, it is known that all elements in d.perm in the range [0,d.nelts) are initialized. I would like to defer that to a time when I (hopefully) have some more spare time. > > + if (expand_perm_as_replicate(d)) > ^~~ > Between function name and open bracket whitespace is missing. Fixed > > diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > > b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > > new file mode 100644 > > index ..27563a00f22b > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c > > @@ -0,0 +1,30 @@ > > +/* Check that the vectorize_vec_perm_const expander correctly deals with > > + replication. Extracted from spec "nab". 
*/ > > + > > +/* { dg-do compile } */ > > +/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */ > > + > > + > > +#define REAL_T double > > +typedef REAL_T MATRIX_T[ 4 ][ 4 ]; > > + > > +int concat_mat_i, concat_mat_j; > > +static void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3); > > +MATRIX_T *rot4p() { > > + MATRIX_T mat3, mat4; > > + static MATRIX_T mat5; > > + concat_mat(mat4, mat3, mat5); > > +} > > +void concat_mat(MATRIX_T m1, MATRIX_T, MATRIX_T m3) { > > + int k; > > + for (;; concat_mat_i++) { > > +concat_mat_j = 0; > > +for (; 4; concat_mat_j++) { > > + k = 0; > > + for (; k < 4; k++) > > +m3[concat_mat_i][concat_mat_j] += m1[concat_mat_i][k]; > > +} > > Just nitpicking, if we could come up with a test case which does not > involve integer overflows due to non-terminating loops, I would prefer > that. Well, I have a version without integer overflows, but it still has non-terminating loops... Will send a v2, Juergen
[PATCH v2] s390x: Optimize vector permute with constant indexes
Loop vectorizer can generate vector permutes with constant indexes where all indexes are equal. Optimize this case to use vector replicate instead of vector permute. gcc/ChangeLog: * config/s390/s390.cc (expand_perm_as_replicate): Implement. (vectorize_vec_perm_const_1): Call new function. * config/s390/vx-builtins.md (vec_splat): Change to... (@vec_splat): ...this. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-expand-replicate.c: New test. Bootstrapped and regtested on s390x. Ok for trunk? Signed-off-by: Juergen Christ --- gcc/config/s390/s390.cc | 33 ++ gcc/config/s390/vx-builtins.md| 2 +- .../s390/vector/vec-expand-replicate.c| 60 +++ 3 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 372a23244032..3148f163627c 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -17923,6 +17923,36 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct expand_vec_perm_d &d) return false; } +static bool +expand_perm_as_replicate (const struct expand_vec_perm_d &d) +{ + unsigned char i; + unsigned char elem; + rtx base = d.op0; + rtx insn; + /* Needed to silence maybe-uninitialized warning. */ + gcc_assert (d.nelt > 0); + elem = d.perm[0]; + for (i = 1; i < d.nelt; ++i) +if (d.perm[i] != elem) + return false; + if (!d.testing_p) +{ + if (elem >= d.nelt) + { + base = d.op1; + elem -= d.nelt; + } + insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem)); + if (insn == NULL_RTX) + return false; + emit_insn (insn); + return true; +} + else +return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing; +} + /* Try to find the best sequence for the vector permute operation described by D. Return true if the operation could be expanded. 
*/ @@ -17941,6 +17971,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) if (expand_perm_as_a_vlbr_vstbr_candidate (d)) return true; + if (expand_perm_as_replicate (d)) +return true; + return false; } diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 432d81a719fc..93c0d408a43e 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -424,7 +424,7 @@ ; Replicate from vector element -(define_expand "vec_splat" +(define_expand "@vec_splat" [(set (match_operand:V_HW 0 "register_operand" "") (vec_duplicate:V_HW (vec_select: (match_operand:V_HW 1 "register_operand" "") diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c new file mode 100644 index ..872b1c9321cd --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c @@ -0,0 +1,60 @@ +/* Check that the vectorize_vec_perm_const expander correctly deals with + replication. Extracted from spec "nab". 
*/ + +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */ + +typedef double POINT_T[3]; +typedef double MATRIX_T[][4]; +typedef struct { + POINT_T a_pos; +} ATOM_T; +typedef struct { + ATOM_T *r_atoms; +} RESIDUE_T; +typedef struct strand_t { + RESIDUE_T *s_residues; +} STRAND_T; +typedef struct strand_t MOLECULE_T; +double xfm_xyz_oxyz4[4]; +MOLECULE_T add_he2o3transformmol_mol, add_he2o3transformmol_sp; +RESIDUE_T add_he2o3transformmol_res; +int add_he2o3transformmol_r, add_he2o3transformmol_a, add_he2o3transformmol_i; +ATOM_T *add_he2o3transformmol_ap; +POINT_T add_he2o3transformmol_xyz, add_he2o3transformmol_nxyz; +static void xfm_xyz(POINT_T oxyz, MATRIX_T mat, POINT_T nxyz) { + int i, j; + double nxyz4[4]; + for (i = 0; i < 3; i++) +xfm_xyz_oxyz4[i] = oxyz[i]; + xfm_xyz_oxyz4[3] = 1.0; + for (i = 0; i < 4; i++) { +nxyz4[i] = 0.0; +for (j = 0; j < 4; j++) + nxyz4[i] += xfm_xyz_oxyz4[j] * mat[j][i]; + } + for (i = 0; i < 3; i++) +nxyz[i] = nxyz4[i]; +} +void add_he2o3transformmol(MATRIX_T mat, int n) { + for (add_he2o3transformmol_sp = add_he2o3transformmol_mol;;) +for (add_he2o3transformmol_r = 0;;) { + add_he2o3transformmol_res = + add_he2o3transformmol_sp.s_residues[add_he2o3transformmol_r]; + for (add_he2o3transformmol_a = 0; add_he2o3transformmol_a < n; add_he2o3transformmol_a++) { +add_he2o3transformmol_ap = +&add_he2o3transformmol_res.r_atoms[add_he2o3transformmol_a]; +for (add_he2o3transformmol_i = 0; add_he2o3transformmol_i < 3; + add_he2o3transformmol_i++) + add_he2o3transformm
Re: [PATCH v2] s390x: Optimize vector permute with constant indexes
Am Tue, Apr 09, 2024 at 05:01:18PM +0200 schrieb Andreas Krebbel: > On 4/9/24 16:31, Juergen Christ wrote: > > Loop vectorizer can generate vector permutes with constant indexes > > where all indexes are equal. Optimize this case to use vector > > replicate instead of vector permute. > > > > gcc/ChangeLog: > > > > * config/s390/s390.cc (expand_perm_as_replicate): Implement. > > (vectorize_vec_perm_const_1): Call new function. > > * config/s390/vx-builtins.md (vec_splat): Change to... > > (@vec_splat): ...this. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/s390/vector/vec-expand-replicate.c: New test. > > > > Bootstrapped and regtested on s390x. Ok for trunk? > > Does this also work when using the vec_perm intrinsic or would we need to > define a matching RTX for > that? Unfortunately, it does not work with vec_perm. > Ok. Thanks! Pushed. Juergen
[PATCH] Do not emulate vectors containing floats.
Fixes various test failures on s390x. gcc/ChangeLog: * tree-vect-stmts.cc (vectorizable_operation): Don't emulate floating point vectors. Signed-off-by: Juergen Christ Regtested and bootstrapped on x86_64-pc-linux-gnu and s390x-ibm-linux-gnu. Okay for trunk? --- gcc/tree-vect-stmts.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 09749ae38174..4164f254fd6e 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -6756,7 +6756,8 @@ vectorizable_operation (vec_info *vinfo, those through even when the mode isn't word_mode. For ops we have to lower the lowering code assumes we are dealing with word_mode. */ - if (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR) + if (FLOAT_MODE_P (vec_mode) + || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR) || !target_support_p) && maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)) /* Check only during analysis. */ -- 2.39.3
[PATCH v2] Do not emulate vectors containing floats.
The emulation via word mode tries to perform integer arithmetic on floating point values instead of floating point arithmetic. This leads to mis-compilations. Failure occurred on s390x on these existing test cases: gcc.dg/vect/tsvc/vect-tsvc-s112.c gcc.dg/vect/tsvc/vect-tsvc-s113.c gcc.dg/vect/tsvc/vect-tsvc-s119.c gcc.dg/vect/tsvc/vect-tsvc-s121.c gcc.dg/vect/tsvc/vect-tsvc-s131.c gcc.dg/vect/tsvc/vect-tsvc-s132.c gcc.dg/vect/tsvc/vect-tsvc-s2233.c gcc.dg/vect/tsvc/vect-tsvc-s421.c gcc.dg/vect/vect-alias-check-14.c gcc.target/s390/vector/partial/s390-vec-length-epil-run-1.c gcc.target/s390/vector/partial/s390-vec-length-epil-run-3.c gcc.target/s390/vector/partial/s390-vec-length-full-run-3.c gcc/ChangeLog: * tree-vect-stmts.cc (vectorizable_operation): Don't emulate floating point vectors. Signed-off-by: Juergen Christ --- gcc/tree-vect-stmts.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 09749ae38174..f95ff2c2aa34 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -6756,7 +6756,8 @@ vectorizable_operation (vec_info *vinfo, those through even when the mode isn't word_mode. For ops we have to lower the lowering code assumes we are dealing with word_mode. */ - if (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR) + if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)) + || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR) || !target_support_p) && maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)) /* Check only during analysis. */ -- 2.39.3
Re: [PATCH v2] Do not emulate vectors containing floats.
Am Fri, Feb 23, 2024 at 01:57:12PM + schrieb Sam James: > > Juergen Christ writes: > > > The emulation via word mode tries to perform integer arithmetic on floating > > point values instead of floating point arithmetic. This leads to > > mis-compilations. > > Is the bug ref + test missing? Sorry, forgot to add the "bootstrapped and tested on s390x and x86_64". Not sure how to reference a bugzilla here. There is 114075 that should be solved with this, too. > > > > Failure occured on s390x on these existing test cases: > > gcc.dg/vect/tsvc/vect-tsvc-s112.c > > gcc.dg/vect/tsvc/vect-tsvc-s113.c > > gcc.dg/vect/tsvc/vect-tsvc-s119.c > > gcc.dg/vect/tsvc/vect-tsvc-s121.c > > gcc.dg/vect/tsvc/vect-tsvc-s131.c > > gcc.dg/vect/tsvc/vect-tsvc-s132.c > > gcc.dg/vect/tsvc/vect-tsvc-s2233.c > > gcc.dg/vect/tsvc/vect-tsvc-s421.c > > gcc.dg/vect/vect-alias-check-14.c > > gcc.target/s390/vector/partial/s390-vec-length-epil-run-1.c > > gcc.target/s390/vector/partial/s390-vec-length-epil-run-3.c > > gcc.target/s390/vector/partial/s390-vec-length-full-run-3.c > > > > gcc/ChangeLog: > > > > * tree-vect-stmts.cc (vectorizable_operation): Don't emulate floating > > point vectors > > > > Signed-off-by: Juergen Christ > > --- > > gcc/tree-vect-stmts.cc | 3 ++- > > 1 file changed, 2 insertions(+), 1 deletion(-) > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > index 09749ae38174..f95ff2c2aa34 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -6756,7 +6756,8 @@ vectorizable_operation (vec_info *vinfo, > > those through even when the mode isn't word_mode. For > > ops we have to lower the lowering code assumes we are > > dealing with word_mode. 
*/ > > - if code == PLUS_EXPR || code == MINUS_EXPR || code == > > NEGATE_EXPR) > > + if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)) > > + || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR) > > || !target_support_p) > >&& maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)) > > /* Check only during analysis. */ >
[PATCH] Add myself to write after approval and DCO.
Hello, I have added myself to write after approval and DCO. Thanks, Juergen Christ ChangeLog: * MAINTAINERS: Add myself to write after approval and DCO. Signed-off-by: Juergen Christ --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index cb5a42501dd2..ca6a27b4c11b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -375,6 +375,7 @@ Dehao Chen Fabien Chêne Clément Chigot Harshit Chopra +Juergen Christ Tamar Christina Eric Christopher Paul Clarke @@ -756,6 +757,7 @@ Certificate of Origin Version 1.1. See https://gcc.gnu.org/dco.html for more information. +Juergen Christ Robin Dapp Robin Dapp Michal Jires -- 2.39.3
[PATCH] s390x: Fix PR112753
Commit 466b100e5fee808d77598e0f294654deec281150 introduced a bug in s390_md_asm_adjust if vector extensions are not available. Fix the control flow of this function to not adjust long double values. gcc/ChangeLog: * config/s390/s390.cc (s390_md_asm_adjust): Fix. gcc/testsuite/ChangeLog: * gcc.target/s390/pr112753.c: New test. Bootstrapped and tested on s390x. Signed-off-by: Juergen Christ --- gcc/config/s390/s390.cc | 4 gcc/testsuite/gcc.target/s390/pr112753.c | 8 2 files changed, 12 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/pr112753.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 29b5dc979207..3a4d2d346f0c 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -17604,6 +17604,10 @@ s390_md_asm_adjust (vec &outputs, vec &inputs, outputs[i] = fprx2; } + if (!TARGET_VXE) +/* Long doubles are stored in FPR pairs - nothing left to do. */ +return after_md_seq; + for (unsigned i = 0; i < ninputs; i++) { if (GET_MODE (inputs[i]) != TFmode) diff --git a/gcc/testsuite/gcc.target/s390/pr112753.c b/gcc/testsuite/gcc.target/s390/pr112753.c new file mode 100644 index ..7183b3f12bed --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/pr112753.c @@ -0,0 +1,8 @@ +/* This caused an ICE on s390x due to a bug in s390_md_asm_adjust when no + vector extension is available. */ + +/* { dg-do compile } */ +/* { dg-options "-O2 -march=zEC12" } */ + +long double strtold_l_internal___x; +void strtold_l_internal() { __asm__("" : : "fm"(strtold_l_internal___x)); } -- 2.39.3
[PATCH] s390: Fix ICE in testcase pr89233
When using GNU vector extensions, an access outside of the vector size caused an ICE on s390. Fix this by aligning with the vec_extract builtin, i.e., computing constant index modulo number of lanes. Fixes testcase gcc.target/s390/pr89233.c. Bootstrapped and tested on s390. OK for mainline? gcc/ChangeLog: * config/s390/vector.md: (*vec_extract) Fix. Signed-off-by: Juergen Christ --- gcc/config/s390/vector.md | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 7d1eb36e8446..deda5990a035 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -532,12 +532,14 @@ (match_operand:V1 "nonmemory_operand" "v,v") (parallel [(match_operand:SI 2 "nonmemory_operand" "an,I")])))] - "TARGET_VX - && (!CONST_INT_P (operands[2]) - || UINTVAL (operands[2]) < GET_MODE_NUNITS (mode))" - "@ - vlgv\t%0,%v1,%Y2 - vste\t%v1,%0,%2" + "TARGET_VX" + { +if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (mode) - 1)); +if (which_alternative == 0) + return "vlgv\t%0,%v1,%Y2"; + return "vste\t%v1,%0,%2"; + } [(set_attr "op_type" "VRS,VRX")]) ; vlgvb, vlgvh, vlgvf, vlgvg -- 2.39.3
[PATCH] s390: split int128 load
Issue two loads when using GPRs instead of one load-multiple. Bootstrapped and tested on s390. OK for mainline? gcc/ChangeLog: * config/s390/s390.md: Split TImode loads. gcc/testsuite/ChangeLog: * gcc.target/s390/int128load.c: New test. Signed-off-by: Juergen Christ --- gcc/config/s390/s390.md| 4 gcc/testsuite/gcc.target/s390/int128load.c | 14 ++ 2 files changed, 14 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/int128load.c diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 3f29ba214427..5bff69aeb350 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -1687,8 +1687,6 @@ [(set (match_operand:TI 0 "nonimmediate_operand" "") (match_operand:TI 1 "general_operand" ""))] "TARGET_ZARCH && reload_completed - && !s_operand (operands[0], TImode) - && !s_operand (operands[1], TImode) && s390_split_ok_p (operands[0], operands[1], TImode, 0)" [(set (match_dup 2) (match_dup 4)) (set (match_dup 3) (match_dup 5))] @@ -1703,8 +1701,6 @@ [(set (match_operand:TI 0 "nonimmediate_operand" "") (match_operand:TI 1 "general_operand" ""))] "TARGET_ZARCH && reload_completed - && !s_operand (operands[0], TImode) - && !s_operand (operands[1], TImode) && s390_split_ok_p (operands[0], operands[1], TImode, 1)" [(set (match_dup 2) (match_dup 4)) (set (match_dup 3) (match_dup 5))] diff --git a/gcc/testsuite/gcc.target/s390/int128load.c b/gcc/testsuite/gcc.target/s390/int128load.c new file mode 100644 index ..35d5380704b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/int128load.c @@ -0,0 +1,14 @@ +/* Check that int128 loads and stores are split. */ + +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=zEC12" } */ + +__int128 global; + +void f(__int128 x) +{ + global = x; +} + +/* { dg-final { scan-assembler-times "lg\t" 2 } } */ +/* { dg-final { scan-assembler-times "stg\t" 2 } } */ -- 2.39.3
[PATCH] s390: implement flags output
Implement flags output for inline assemblies. Only use one output constraint that captures the whole condition code. No breakout into different condition codes is allowed. Also, only one condition code variable is allowed. Add further logic to canonicalize various cases where we combine different cases of possible condition codes. Bootstrapped and tested on s390. OK for mainline? gcc/ChangeLog: * config/s390/s390-c.cc (s390_cpu_cpp_builtins): Define __GCC_ASM_FLAG_OUTPUTS__. * config/s390/s390.cc (s390_canonicalize_comparison): More UNSPEC_CC_TO_INT cases. (s390_md_asm_adjust): Implement flags output. * config/s390/s390.md (ccstore4): Allow mask operands. * doc/extend.texi: Document flags output. gcc/testsuite/ChangeLog: * gcc.target/s390/ccor.c: New test. Signed-off-by: Juergen Christ --- gcc/config/s390/s390-c.cc| 1 + gcc/config/s390/s390.cc | 139 ++- gcc/config/s390/s390.md | 8 +- gcc/doc/extend.texi | 5 + gcc/testsuite/gcc.target/s390/ccor.c | 88 + 5 files changed, 232 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/ccor.c diff --git a/gcc/config/s390/s390-c.cc b/gcc/config/s390/s390-c.cc index 269f4f8e978d..c126e6d323d7 100644 --- a/gcc/config/s390/s390-c.cc +++ b/gcc/config/s390/s390-c.cc @@ -409,6 +409,7 @@ s390_cpu_cpp_builtins (cpp_reader *pfile) cpp_define (pfile, "__LONG_DOUBLE_128__"); cl_target_option_save (&opts, &global_options, &global_options_set); s390_cpu_cpp_builtins_internal (pfile, &opts, NULL); + cpp_define (pfile, "__GCC_ASM_FLAG_OUTPUTS__"); } #if S390_USE_TARGET_ATTRIBUTE diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 61c5f88de8af..a19dd7849b84 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -1877,6 +1877,97 @@ s390_canonicalize_comparison (int *code, rtx *op0, rtx *op1, *code = new_code; } } + /* Remove UNSPEC_CC_TO_INT from connectives. This happens for + checks against multiple condition codes. 
*/ + if (GET_CODE (*op0) == AND + && GET_CODE (XEXP (*op0, 0)) == UNSPEC + && XINT (XEXP (*op0, 0), 1) == UNSPEC_CC_TO_INT + && XVECLEN (XEXP (*op0, 0), 0) == 1 + && REGNO (XVECEXP (XEXP (*op0, 0), 0, 0)) == CC_REGNUM + && CONST_INT_P (XEXP (*op0, 1)) + && CONST_INT_P (*op1) + && INTVAL (XEXP (*op0, 1)) == -3 + && *code == EQ) +{ + if (INTVAL (*op1) == 0) + { + /* case cc == 0 || cc = 2 => mask = 0xa */ + *op0 = XVECEXP (XEXP (*op0, 0), 0, 0); + *op1 = gen_rtx_CONST_INT (VOIDmode, 0xa); + } + else if (INTVAL (*op1) == 1) + { + /* case cc == 1 || cc == 3 => mask = 0x5 */ + *op0 = XVECEXP (XEXP (*op0, 0), 0, 0); + *op1 = gen_rtx_CONST_INT (VOIDmode, 0x5); + } +} + if (GET_CODE (*op0) == PLUS + && GET_CODE (XEXP (*op0, 0)) == UNSPEC + && XINT (XEXP (*op0, 0), 1) == UNSPEC_CC_TO_INT + && XVECLEN (XEXP (*op0, 0), 0) == 1 + && REGNO (XVECEXP (XEXP (*op0, 0), 0, 0)) == CC_REGNUM + && CONST_INT_P (XEXP (*op0, 1)) + && CONST_INT_P (*op1) + && (*code == LEU || *code == GTU)) +{ + if (INTVAL (*op1) == 1) + { + if (INTVAL (XEXP (*op0, 1)) == -1) + { + /* case cc == 1 || cc == 2 => mask = 0x6 */ + *op0 = XVECEXP (XEXP (*op0, 0), 0, 0); + *op1 = gen_rtx_CONST_INT (VOIDmode, 0x6); + *code = *code == GTU ? NE : EQ; + } + else if (INTVAL (XEXP (*op0, 1)) == -2) + { + /* case cc == 2 || cc == 3 => mask = 0x3 */ + *op0 = XVECEXP (XEXP (*op0, 0), 0, 0); + *op1 = gen_rtx_CONST_INT (VOIDmode, 0x3); + *code = *code == GTU ? NE : EQ; + } + } + else if (INTVAL (*op1) == 2 + && INTVAL (XEXP (*op0, 1)) == -1) + { + /* case cc == 1 || cc == 2 || cc == 3 => mask = 0x7 */ + *op0 = XVECEXP (XEXP (*op0, 0), 0, 0); + *op1 = gen_rtx_CONST_INT (VOIDmode, 0x7); + *code = *code == GTU ? 
NE : EQ; + } +} + else if (*code == LEU || *code == GTU) +{ + if (GET_CODE (*op0) == UNSPEC + && XINT (*op0, 1) == UNSPEC_CC_TO_INT + && XVECLEN (*op0, 0) == 1 + && REGNO (XVECEXP (*op0, 0, 0)) == CC_REGNUM + && CONST_INT_P (*op1)) + { + if (INTVAL (*op1) == 1) + { + /* case cc == 0 || cc == 1 => mask = 0xc */ + *op0 = XVECEXP (*op0, 0, 0); + *op1 = gen_rtx_CONST_INT (VOIDmode, 0xc); + *code = *code =
Re: [PATCH] vect: Multistep float->int conversion only with no trapping math
Am Thu, Aug 08, 2024 at 02:06:44PM +0200 schrieb Richard Biener: > On Mon, Aug 5, 2024 at 4:02 PM Juergen Christ wrote: > > > > Am Mon, Aug 05, 2024 at 01:00:31PM +0200 schrieb Richard Biener: > > > On Fri, Aug 2, 2024 at 2:43 PM Juergen Christ > > > wrote: > > > > > > > > Do not convert floats to ints in multiple steps if trapping math is > > > > enabled. This might hide some inexact signals. > > > > > > > > Also use correct sign (the sign of the target integer type) for the > > > > intermediate steps. This only affects undefined behaviour (casting > > > > floats to unsigned datatype where the float is negative). > > > > > > > > gcc/ChangeLog: > > > > > > > > * tree-vect-stmts.cc (vectorizable_conversion): multi-step > > > > float to int conversion only with no trapping math and correct > > > > sign. > > > > > > > > Signed-off-by: Juergen Christ > > > > > > > > Bootstrapped and tested on x86 and s390. Ok for trunk? > > > > > > > > --- > > > > gcc/tree-vect-stmts.cc | 8 +--- > > > > 1 file changed, 5 insertions(+), 3 deletions(-) > > > > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > index fdcda0d2abae..2ddd13383193 100644 > > > > --- a/gcc/tree-vect-stmts.cc > > > > +++ b/gcc/tree-vect-stmts.cc > > > > @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo, > > > > break; > > > > > > > > cvt_type > > > > - = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > (rhs_mode), 0); > > > > + = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > (rhs_mode), > > > > + TYPE_UNSIGNED (lhs_type)); > > > > > > But lhs_type should be a float type here, the idea that for a > > > FLOAT_EXPR (int -> float) > > > a signed integer type is the natural one to use - as it's 2x wider > > > than the original > > > RHS type its signedness doesn't matter. Note all float types should be > > > !TYPE_UNSIGNED so this hunk is a no-op but still less clear on the intent > > > IMO. > > > > > > Please drop it. > > > > Will do. Sorry about that. 
> > > > > > cvt_type = get_same_sized_vectype (cvt_type, vectype_in); > > > > if (cvt_type == NULL_TREE) > > > > goto unsupported; > > > > @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info *vinfo, > > > >if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode)) > > > > goto unsupported; > > > > > > > > - if (code == FIX_TRUNC_EXPR) > > > > + if (code == FIX_TRUNC_EXPR && !flag_trapping_math) > > > > { > > > > cvt_type > > > > - = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > (rhs_mode), 0); > > > > + = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > (rhs_mode), > > > > + TYPE_UNSIGNED (lhs_type)); > > > > > > Here it might be relevant for correctness - we have to choose between > > > sfix and ufix for the float -> [u]int conversion. > > > > > > Do you have a testcase? Shouldn't the exactness be independent of the > > > integer > > > type we convert to? > > > > I was looking at this little program which contains undefined behaviour: > > > > #include > > > > __attribute__((noinline,noclone,noipa)) > > void > > vec_pack_ufix_trunc_v2df (double *in, unsigned int *out); > > > > void > > vec_pack_ufix_trunc_v2df (double *in, unsigned int *out) > > { > > out[0] = in[0]; > > out[1] = in[1]; > > out[2] = in[2]; > > out[3] = in[3]; > > } > > > > int main() > > { > > double in[] = {-1,-2,-3,-4}; > > unsigned int out[4]; > > > > vec_pack_ufix_trunc_v2df (in, out); > > for (int i = 0; i < 4; ++i) > > printf("out[%d] = %u\n", i, out[i]); > > return 0; > > } > > > > On s390x, I get different results after vectorization: > > > > out[0]
Re: [PATCH] vect: Multistep float->int conversion only with no trapping math
Am Tue, Aug 20, 2024 at 10:15:22AM +0200 schrieb Richard Biener: > On Fri, Aug 9, 2024 at 2:58 PM Juergen Christ wrote: > > > > Am Thu, Aug 08, 2024 at 02:06:44PM +0200 schrieb Richard Biener: > > > On Mon, Aug 5, 2024 at 4:02 PM Juergen Christ > > > wrote: > > > > > > > > Am Mon, Aug 05, 2024 at 01:00:31PM +0200 schrieb Richard Biener: > > > > > On Fri, Aug 2, 2024 at 2:43 PM Juergen Christ > > > > > wrote: > > > > > > > > > > > > Do not convert floats to ints in multiple step if trapping math is > > > > > > enabled. This might hide some inexact signals. > > > > > > > > > > > > Also use correct sign (the sign of the target integer type) for the > > > > > > intermediate steps. This only affects undefined behaviour (casting > > > > > > floats to unsigned datatype where the float is negative). > > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > * tree-vect-stmts.cc (vectorizable_conversion): multi-step > > > > > > float to int conversion only with trapping math and > > > > > > correct > > > > > > sign. > > > > > > > > > > > > Signed-off-by: Juergen Christ > > > > > > > > > > > > Bootstrapped and tested on x84 and s390. Ok for trunk? 
> > > > > > > > > > > > --- > > > > > > gcc/tree-vect-stmts.cc | 8 +--- > > > > > > 1 file changed, 5 insertions(+), 3 deletions(-) > > > > > > > > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > > > index fdcda0d2abae..2ddd13383193 100644 > > > > > > --- a/gcc/tree-vect-stmts.cc > > > > > > +++ b/gcc/tree-vect-stmts.cc > > > > > > @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo, > > > > > > break; > > > > > > > > > > > > cvt_type > > > > > > - = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > > > (rhs_mode), 0); > > > > > > + = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > > > (rhs_mode), > > > > > > + TYPE_UNSIGNED > > > > > > (lhs_type)); > > > > > > > > > > But lhs_type should be a float type here, the idea that for a > > > > > FLOAT_EXPR (int -> float) > > > > > a signed integer type is the natural one to use - as it's 2x wider > > > > > than the original > > > > > RHS type it's signedness doesn't matter. Note all float types should > > > > > be > > > > > !TYPE_UNSIGNED so this hunk is a no-op but still less clear on the > > > > > intent IMO. > > > > > > > > > > Please drop it. > > > > > > > > Will do. Sorry about that. 
> > > > > > > > > > cvt_type = get_same_sized_vectype (cvt_type, vectype_in); > > > > > > if (cvt_type == NULL_TREE) > > > > > > goto unsupported; > > > > > > @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info *vinfo, > > > > > >if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode)) > > > > > > goto unsupported; > > > > > > > > > > > > - if (code == FIX_TRUNC_EXPR) > > > > > > + if (code == FIX_TRUNC_EXPR && !flag_trapping_math) > > > > > > { > > > > > > cvt_type > > > > > > - = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > > > (rhs_mode), 0); > > > > > > + = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > > > (rhs_mode), > > > > > > + TYPE_UNSIGNED > > > > > > (lhs_type)); > > > > > > > > > > Here it might be relevant for correctness - we have to choose between > > > > > sfix and ufix for the float -> [u]int conversion. > > > > > > > > > > Do you have a testcase? Shouldn't the exactness be independent of > > > > >
Re: [PATCH] vect: Multistep float->int conversion only with no trapping math
Am Tue, Aug 20, 2024 at 02:51:02PM +0200 schrieb Richard Biener: > On Tue, Aug 20, 2024 at 11:16 AM Juergen Christ wrote: > > > > Am Tue, Aug 20, 2024 at 10:15:22AM +0200 schrieb Richard Biener: > > > On Fri, Aug 9, 2024 at 2:58 PM Juergen Christ > > > wrote: > > > > > > > > Am Thu, Aug 08, 2024 at 02:06:44PM +0200 schrieb Richard Biener: > > > > > On Mon, Aug 5, 2024 at 4:02 PM Juergen Christ > > > > > wrote: > > > > > > > > > > > > Am Mon, Aug 05, 2024 at 01:00:31PM +0200 schrieb Richard Biener: > > > > > > > On Fri, Aug 2, 2024 at 2:43 PM Juergen Christ > > > > > > > wrote: > > > > > > > > > > > > > > > > Do not convert floats to ints in multiple step if trapping math > > > > > > > > is > > > > > > > > enabled. This might hide some inexact signals. > > > > > > > > > > > > > > > > Also use correct sign (the sign of the target integer type) for > > > > > > > > the > > > > > > > > intermediate steps. This only affects undefined behaviour > > > > > > > > (casting > > > > > > > > floats to unsigned datatype where the float is negative). > > > > > > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > > > > > * tree-vect-stmts.cc (vectorizable_conversion): > > > > > > > > multi-step > > > > > > > > float to int conversion only with trapping math and > > > > > > > > correct > > > > > > > > sign. > > > > > > > > > > > > > > > > Signed-off-by: Juergen Christ > > > > > > > > > > > > > > > > Bootstrapped and tested on x84 and s390. Ok for trunk? 
> > > > > > > > > > > > > > > > --- > > > > > > > > gcc/tree-vect-stmts.cc | 8 +--- > > > > > > > > 1 file changed, 5 insertions(+), 3 deletions(-) > > > > > > > > > > > > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > > > > > index fdcda0d2abae..2ddd13383193 100644 > > > > > > > > --- a/gcc/tree-vect-stmts.cc > > > > > > > > +++ b/gcc/tree-vect-stmts.cc > > > > > > > > @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo, > > > > > > > > break; > > > > > > > > > > > > > > > > cvt_type > > > > > > > > - = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > > > > > (rhs_mode), 0); > > > > > > > > + = build_nonstandard_integer_type (GET_MODE_BITSIZE > > > > > > > > (rhs_mode), > > > > > > > > + TYPE_UNSIGNED > > > > > > > > (lhs_type)); > > > > > > > > > > > > > > But lhs_type should be a float type here, the idea that for a > > > > > > > FLOAT_EXPR (int -> float) > > > > > > > a signed integer type is the natural one to use - as it's 2x wider > > > > > > > than the original > > > > > > > RHS type it's signedness doesn't matter. Note all float types > > > > > > > should be > > > > > > > !TYPE_UNSIGNED so this hunk is a no-op but still less clear on > > > > > > > the intent IMO. > > > > > > > > > > > > > > Please drop it. > > > > > > > > > > > > Will do. Sorry about that. > > > > > > > > > > > > > > cvt_type = get_same_sized_vectype (cvt_type, > > > > > > > > vectype_in); > > > > > > > > if (cvt_type == NULL_TREE) > > > > > > > > goto unsupported; > > > > > > > > @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info > > > > > > > > *vinfo, > > > > > > > >if (GET_MODE_SIZE (lhs_mode) >= GET_MODE
[PATCH] s390: define single step vector casts
Some casts were missing leading to missed or bad vectorizations where casting was done scalar followed by a vector creation from the individual elements. gcc/ChangeLog: * config/s390/vector.md (VEC_HALF_NARROWED): New mode iterator. (vec_half_narrowed): ditto. (trunc2): New pattern. (vec_pack_ufix_trunc_v2df): ditto. (vec_pack_sfix_trunc_v2df): ditto. (vec_unpack_sfix_trunc_lo_v4sf): ditto. (vec_unpack_sfix_trunc_hi_v4sf): ditto. (vec_unpack_ufix_trunc_lo_v4sf): ditto. (vec_unpack_ufix_trunc_hi_v4sf): ditto. (floatv2siv2sf2): ditto. (floatunsv2siv2sf2): ditto. (vec_unpacks_float_hi_v4si): ditto. (vec_unpacks_float_lo_v4si): ditto. (vec_unpacku_float_hi_v4si): ditto. (vec_unpacku_float_lo_v4si): ditto. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-cast-single.c: New test. * gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c: New test. Bootstrapped and regtested on s390x. Ok for trunk? Signed-off-by: Juergen Christ --- gcc/config/s390/vector.md | 170 ++- .../gcc.target/s390/vector/vec-cast-single.c | 271 ++ .../s390/vector/vec_pack_ufix_trunc_v2df.c| 30 ++ 3 files changed, 463 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cast-single.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec_pack_ufix_trunc_v2df.c diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 40de0c75a7cf..356f25d26deb 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -89,6 +89,8 @@ (define_mode_iterator VI_EXTEND [V2QI V2HI V2SI V4QI V4HI]) +(define_mode_iterator VI_TRUNC [V2HI V2SI V2DI V4HI V4SI]) + ; Empty string for all but TImode. This is used to hide the TImode ; expander name in case it is defined already. See addti3 for an ; example. @@ -211,6 +213,14 @@ (V1SF "v1df") (V2SF "v2df") (V4SF "v4df") (V1DF "v1tf") (V2DF "v2tf")]) +; Vector with narrowed element size and the same number of elements. 
+(define_mode_attr VEC_HALF_NARROWED [(V1HI "V1QI") (V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") + (V1SI "V1HI") (V2SI "V2HI") (V4SI "V4HI") + (V1DI "V1DI") (V2DI "V2SI")]) +(define_mode_attr vec_half_narrowed [(V1HI "v1qi") (V2HI "v2qi") (V4HI "v4qi") (V8HI "v8qi") + (V1SI "v1hi") (V2SI "v2hi") (V4SI "v4hi") + (V1DI "v1di") (V2DI "v2si")]) + ; Vector with half the element size AND half the number of elements. (define_mode_attr vec_halfhalf [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") @@ -2422,6 +2432,17 @@ operands[2] = gen_reg_rtx (V4SFmode); }) +;; vector truncate + +; downcasts + +(define_insn "trunc2" + [(set (match_operand: 0 "register_operand" "=v") +(truncate: (match_operand:VI_TRUNC 1 "register_operand" "v")))] + "TARGET_VX" + "vpk\t %0,%1,%1" + [(set_attr "op_type" "VRR")]) + ;; vector unpack v16qi ; signed @@ -3177,17 +3198,150 @@ emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2])); emit_insn (gen_vstlv16qi (operands[1], len, mem)); DONE; -});; +}) + +(define_expand "vec_pack_ufix_trunc_v2df" + [(match_operand:V4SI 0 "register_operand") + (match_operand:V2DF 1 "register_operand") + (match_operand:V2DF 2 "register_operand")] + "TARGET_VX" +{ + rtx r1 = gen_reg_rtx (V2DImode); + rtx r2 = gen_reg_rtx (V2DImode); + + emit_insn (gen_fixuns_truncv2dfv2di2 (r1, operands[1])); + emit_insn (gen_fixuns_truncv2dfv2di2 (r2, operands[2])); + emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2)); + DONE; +}) + +(define_expand "vec_pack_sfix_trunc_v2df" + [(match_operand:V4SI 0 "register_operand") + (match_operand:V2DF 1 "register_operand") + (match_operand:V2DF 2 "register_operand")] + "TARGET_VX" +{ + rtx r1 = gen_reg_rtx (V2DImode); + rtx r2 = gen_reg_rtx (V2DImode); + + emit_insn (gen_fix_truncv2dfv2di2 (r1, operands[1])); + emit_insn (gen_fix_truncv2dfv2di2 (r2, operands[2])); + emit_insn (gen_vec_pack_trunc_v2di (operands[0], r1, r2)); + DONE; +}) + +; v4sf -> v2di +(define_expand "vec_unpack_sfix_trunc_lo_v4sf" + [(match_operand:V2DI 0 
"register_operand") + (match_operand:V4SF 1 "register_operand")] + "TARGET_VX" +{ + rtx r = gen_reg_rtx(V4SImode); + + emit_insn (gen_fix_truncv4sfv4si2 (r, operands
[PATCH] vect: Multistep float->int conversion only with no trapping math
Do not convert floats to ints in multiple steps if trapping math is enabled. This might hide some inexact signals. Also use correct sign (the sign of the target integer type) for the intermediate steps. This only affects undefined behaviour (casting floats to unsigned datatype where the float is negative). gcc/ChangeLog: * tree-vect-stmts.cc (vectorizable_conversion): multi-step float to int conversion only with no trapping math and correct sign. Signed-off-by: Juergen Christ Bootstrapped and tested on x86 and s390. Ok for trunk? --- gcc/tree-vect-stmts.cc | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index fdcda0d2abae..2ddd13383193 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo, break; cvt_type - = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0); + = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), + TYPE_UNSIGNED (lhs_type)); cvt_type = get_same_sized_vectype (cvt_type, vectype_in); if (cvt_type == NULL_TREE) goto unsupported; @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info *vinfo, if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode)) goto unsupported; - if (code == FIX_TRUNC_EXPR) + if (code == FIX_TRUNC_EXPR && !flag_trapping_math) { cvt_type - = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0); + = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), + TYPE_UNSIGNED (lhs_type)); cvt_type = get_same_sized_vectype (cvt_type, vectype_in); if (cvt_type == NULL_TREE) goto unsupported; -- 2.43.5
Re: [PATCH] vect: Multistep float->int conversion only with no trapping math
Am Mon, Aug 05, 2024 at 01:00:31PM +0200 schrieb Richard Biener: > On Fri, Aug 2, 2024 at 2:43 PM Juergen Christ wrote: > > > > Do not convert floats to ints in multiple step if trapping math is > > enabled. This might hide some inexact signals. > > > > Also use correct sign (the sign of the target integer type) for the > > intermediate steps. This only affects undefined behaviour (casting > > floats to unsigned datatype where the float is negative). > > > > gcc/ChangeLog: > > > > * tree-vect-stmts.cc (vectorizable_conversion): multi-step > > float to int conversion only with trapping math and correct > > sign. > > > > Signed-off-by: Juergen Christ > > > > Bootstrapped and tested on x84 and s390. Ok for trunk? > > > > --- > > gcc/tree-vect-stmts.cc | 8 +--- > > 1 file changed, 5 insertions(+), 3 deletions(-) > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > index fdcda0d2abae..2ddd13383193 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -5448,7 +5448,8 @@ vectorizable_conversion (vec_info *vinfo, > > break; > > > > cvt_type > > - = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), > > 0); > > + = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), > > + TYPE_UNSIGNED (lhs_type)); > > But lhs_type should be a float type here, the idea that for a > FLOAT_EXPR (int -> float) > a signed integer type is the natural one to use - as it's 2x wider > than the original > RHS type it's signedness doesn't matter. Note all float types should be > !TYPE_UNSIGNED so this hunk is a no-op but still less clear on the intent IMO. > > Please drop it. Will do. Sorry about that. 
> > cvt_type = get_same_sized_vectype (cvt_type, vectype_in); > > if (cvt_type == NULL_TREE) > > goto unsupported; > > @@ -5505,10 +5506,11 @@ vectorizable_conversion (vec_info *vinfo, > >if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode)) > > goto unsupported; > > > > - if (code == FIX_TRUNC_EXPR) > > + if (code == FIX_TRUNC_EXPR && !flag_trapping_math) > > { > > cvt_type > > - = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), > > 0); > > + = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), > > + TYPE_UNSIGNED (lhs_type)); > > Here it might be relevant for correctness - we have to choose between > sfix and ufix for the float -> [u]int conversion. > > Do you have a testcase? Shouldn't the exactness be independent of the > integer > type we convert to? I was looking at this little program which contains undefined behaviour: #include <stdio.h> __attribute__((noinline,noclone,noipa)) void vec_pack_ufix_trunc_v2df (double *in, unsigned int *out); void vec_pack_ufix_trunc_v2df (double *in, unsigned int *out) { out[0] = in[0]; out[1] = in[1]; out[2] = in[2]; out[3] = in[3]; } int main() { double in[] = {-1,-2,-3,-4}; unsigned int out[4]; vec_pack_ufix_trunc_v2df (in, out); for (int i = 0; i < 4; ++i) printf("out[%d] = %u\n", i, out[i]); return 0; } On s390x, I get different results after vectorization: out[0] = 4294967295 out[1] = 4294967294 out[2] = 4294967293 out[3] = 4294967292 than without vectorization: out[0] = 0 out[1] = 0 out[2] = 0 out[3] = 0 Even if this is undefined behaviour, I think it would be nice to have consistent results here. Also, while I added an expander to circumvent this problem in a previous patch, reviewers requested to hide this behind trapping math. Thus, I looked into this. Seeing the result from the CI for aarch64, I guess there are some tests that actually expect this vectorization to always happen even though it might not be safe w.r.t. trapping math. 
> > > cvt_type = get_same_sized_vectype (cvt_type, vectype_in); > > if (cvt_type == NULL_TREE) > > goto unsupported; > > -- > > 2.43.5 > >
[PATCH] s390: Fix UNSPEC_CC_TO_INT canonicalization
Canonicalization of comparisons for UNSPEC_CC_TO_INT missed one case causing unnecessarily complex code. This especially seems to hit the Linux kernel. gcc/ChangeLog: * config/s390/s390.cc (s390_canonicalize_comparison): Add missing UNSPEC_CC_TO_INT case. gcc/testsuite/ChangeLog: * gcc.target/s390/ccusage.c: New test. Signed-off-by: Juergen Christ Bootstrapped and regression tested on s390. Okay for trunk? Okay to backport to GCC 14? --- gcc/config/s390/s390.cc | 2 +- gcc/testsuite/gcc.target/s390/ccusage.c | 37 + 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/s390/ccusage.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 25d43ae3e138..c36c33ff8280 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -1859,7 +1859,7 @@ s390_canonicalize_comparison (int *code, rtx *op0, rtx *op1, && CONST_INT_P (XEXP (*op0, 1)) && CONST_INT_P (*op1) && INTVAL (XEXP (*op0, 1)) == -3 - && *code == EQ) + && (*code == EQ || *code == NE)) { if (INTVAL (*op1) == 0) { diff --git a/gcc/testsuite/gcc.target/s390/ccusage.c b/gcc/testsuite/gcc.target/s390/ccusage.c new file mode 100644 index ..e25f712e25ca --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/ccusage.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=zEC12 -mzarch" } */ + +static __attribute__((always_inline)) inline +int __atomic_dec_and_test(int *ptr) +{ +int cc; +asm volatile( +" alsi%[ptr],-1\n" +: "=@cc" (cc), [ptr] "+QS" (*ptr) : : "memory"); +return (cc == 0) || (cc == 2); +} + +int a; +void dummy(void); +long fu(void) +{ +if (__atomic_dec_and_test(&a)) +return 5; +return 8; +} + +void bar(void) +{ +if (__atomic_dec_and_test(&a)) +dummy(); +} + +int foo(int x) +{ +int cc; +asm volatile ("ahi %[x],42\n" +: [x] "+d" (x), "=@cc" (cc)); +return !(cc == 0 || cc == 2) ? 42 : 13; +} + +/* { dg-final { scan-assembler-not {ipm} } } */ -- 2.43.5
[PATCH] s390: fix delegitimization of addresses
In legitimize_pic_address we create a (const (unspec ... UNSPEC_GOTENT)) if the GOT offset might be >= 4k. However, s390_delegitimize_address does not contain a case for this scenario. gcc/ChangeLog: * config/s390/s390.cc (s390_delegitimize_address): Add missing case. gcc/testsuite/ChangeLog: * gcc.target/s390/delegitimize.c: New test. Bootstrapped and tested on s390x. Okay for trunk? Signed-off-by: Juergen Christ --- gcc/config/s390/s390.cc | 15 + gcc/testsuite/gcc.target/s390/delegitimize.c | 22 2 files changed, 37 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/delegitimize.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 29aef501fdd2..535659ee5181 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -8218,6 +8218,21 @@ s390_delegitimize_address (rtx orig_x) return plus_constant (Pmode, XVECEXP (y, 0, 0), offset); } + if (GET_CODE (x) == CONST) +{ + /* Extract the symbol ref from: +(const:DI (unspec:DI [(symbol_ref:DI ("foo"))] + UNSPEC_PLT/GOTENT)) */ + + y = XEXP (x, 0); + if (GET_CODE (y) == UNSPEC + && (XINT (y, 1) == UNSPEC_GOTENT + || XINT (y, 1) == UNSPEC_PLT31)) + return XVECEXP (y, 0, 0); + else + return orig_x; +} + if (GET_CODE (x) != MEM) return orig_x; diff --git a/gcc/testsuite/gcc.target/s390/delegitimize.c b/gcc/testsuite/gcc.target/s390/delegitimize.c new file mode 100644 index ..bf143745dca2 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/delegitimize.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-nostdinc -std=gnu11 -fshort-wchar -funsigned-char -fno-common -fno-PIE -fno-strict-aliasing -m64 -fPIC -mpacked-stack -mbackchain -msoft-float -march=z13 -mtune=z13 -mindirect-branch=thunk-extern -mfunction-return=thunk-extern -mindirect-branch-table -DCC_USING_EXPOLINE -pipe -Wno-sign-compare -fno-asynchronous-unwind-tables -DCONFIG_AS_CFI_VAL_OFFSET=1 -fno-delete-null-pointer-checks -O2 -fno-allow-store-data-races -fno-stack-protector -ftrivial-auto-var-init=zero 
-fno-stack-clash-protection -pg -mrecord-mcount -mnop-mcount -mfentry -fno-inline-functions-called-once -fmin-function-alignment=8 -fstrict-flex-arrays=3 -fno-strict-overflow -fno-stack-check -fconserve-stack -Wall -Wundef -Werror=implicit-function-declaration -Werror=implicit-int -Werror=return-type -Werror=strict-prototypes -Wno-format-security -Wno-trigraphs -Wno-frame-address -Wno-address-of-packed-member -Wmissing-declarations -Wmissing-prototypes -Wframe-larger-than=2048 -Wno-main -Wno-dangling-pointer -Wvla -Wno-pointer-sign -Wcast-function-type -Wno-stringop-overflow -Wno-array-bounds -Wno-alloc-size-larger-than -Wimplicit-fallthrough=5 -Werror=date-time -Werror=incompatible-pointer-types -Werror=designated-init -Wenum-conversion -Wextra -Wunused -Wno-unused-but-set-variable -Wno-unused-const-variable -Wno-packed-not-aligned -Wno-format-overflow -Wno-format-truncation -Wno-stringop-truncation -Wno-override-init -Wno-missing-field-initializers -Wno-type-limits -Wno-shift-negative-value -Wno-maybe-uninitialized -Wno-sign-compare -Wno-unused-parameter -g -gdwarf-4 -fdump-rtl-final-details" } */ + +struct sk_buff { + struct { +struct { + struct { +int inner_ipproto; + }; +}; + }; +}; +void skb_udp_tunnel_segment(struct sk_buff *skb); +const int *inet_offloads[42], *inet6_offloads[42]; +_Bool skb_udp_tunnel_segment_is_ipv6; +void skb_udp_tunnel_segment(struct sk_buff *skb) { + const int **offloads = + skb_udp_tunnel_segment_is_ipv6 ? inet6_offloads : inet_offloads; + *(volatile typeof(_Generic(0, default : 0)) *)&offloads[skb->inner_ipproto]; +} + +/* { dg-final { scan-rtl-dump-not "Failed to expand as dwarf:" "final" } } */ -- 2.43.5
[PATCH] s390: Optimize vec_cmpge followed by vec_sel
A vec_cmpge produces a negation. Replace this negation by swapping the two selection choices of a vec_sel based on the result of the vec_cmpge. Bootstrapped and regression tested on s390x. gcc/ChangeLog: * config/s390/vx-builtins.md: New vsel pattern. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-cmpge.c: New test. Signed-off-by: Juergen Christ --- gcc/config/s390/vx-builtins.md | 11 +++ .../gcc.target/s390/vector/vec-cmpge.c | 18 ++ 2 files changed, 29 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-cmpge.c diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index f4248c55d4ec..0ce3ff6ef4a6 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -530,6 +530,17 @@ "vsel\t%v0,%1,%2,%3" [(set_attr "op_type" "VRR")]) +(define_insn "vsel_swapped" + [(set (match_operand:V_HW_FT 0 "register_operand" "=v") + (ior:V_HW_FT +(and:V_HW_FT (not:V_HW_FT (match_operand:V_HW_FT 3 "register_operand" "v")) + (match_operand:V_HW_FT 1 "register_operand" "v")) +(and:V_HW_FT (match_dup 3) + (match_operand:V_HW_FT 2 "register_operand" "v"] + "TARGET_VX" + "vsel\t%v0,%2,%1,%3" + [(set_attr "op_type" "VRR")]) + ; Vector sign extend to doubleword diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-cmpge.c b/gcc/testsuite/gcc.target/s390/vector/vec-cmpge.c new file mode 100644 index ..eb188690ae41 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-cmpge.c @@ -0,0 +1,18 @@ +/* Check that vec_sel absorbs a negation generated by vec_cmpge. */ + +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z13" } */ + +typedef __attribute__((vector_size(16))) unsigned char uv16qi; + +#include + +void f(char *res, uv16qi ctrl) +{ + uv16qi a = vec_splat_u8(0xfe); + uv16qi b = vec_splat_u8(0x80); + uv16qi mask = vec_cmpge(ctrl, b); + *(uv16qi *)res = vec_sel(a, b, mask); +} + +/* { dg-final { scan-assembler-not "vno\t" } } */ -- 2.39.3
[PATCH] s390: Fix vec_init default expander
Do not reinitialize vector lanes to zero since they are already initialized to zero. Bootstrapped and regression tested on s390x. gcc/ChangeLog: * config/s390/s390.cc (vec_init): Fix default case. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/vec-init-3.c: New test. Signed-off-by: Juergen Christ --- gcc/config/s390/s390.cc | 11 ++- .../gcc.target/s390/vector/vec-init-3.c | 17 + 2 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-init-3.c diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 505de995da87..31b646782721 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -7130,11 +7130,12 @@ s390_expand_vec_init (rtx target, rtx vals) if (!general_operand (elem, GET_MODE (elem))) elem = force_reg (inner_mode, elem); - emit_insn (gen_rtx_SET (target, - gen_rtx_UNSPEC (mode, - gen_rtvec (3, elem, -GEN_INT (i), target), - UNSPEC_VEC_SET))); + if (elem != const0_rtx) + emit_insn (gen_rtx_SET (target, + gen_rtx_UNSPEC (mode, + gen_rtvec (3, elem, + GEN_INT (i), target), + UNSPEC_VEC_SET))); } } diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-init-3.c b/gcc/testsuite/gcc.target/s390/vector/vec-init-3.c new file mode 100644 index ..12008a963ffb --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-init-3.c @@ -0,0 +1,17 @@ +/* Check that the default case of the vec_init expander does its job. */ + +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z13" } */ + +typedef __attribute__((vector_size(16))) signed int v4si; + +extern v4si G; + +v4si +n (signed int a) +{ + return G == (v4si){ a }; +} +/* { dg-final { scan-assembler-times "vzero" 1 } } */ +/* { dg-final { scan-assembler-times "vlvgf\t" 1 } } */ +/* { dg-final { scan-assembler-not "vleif\t" } } */ -- 2.39.3