The first patch is a backport of Prathamesh's aarch64 fix. The other three were approved for branches during the initial submission.
Tested on aarch64-linux-gnu, armeb-eabi and x86_64-linux-gnu; pushed.

Richard
>From 03d683ec52cd77fa16effdc7d2c5dff7d49249d9 Mon Sep 17 00:00:00 2001
From: Prathamesh Kulkarni <prathamesh.kulka...@linaro.org>
Date: Wed, 21 Aug 2019 18:34:43 +0000
Subject: [PATCH 1/4] re PR target/90724 (ICE with __sync_bool_compare_and_swap
 with -march=armv8.2-a+sve)

2020-02-18  Richard Sandiford  <richard.sandif...@arm.com>

gcc/
	Backport from mainline
	2019-08-21  Prathamesh Kulkarni  <prathamesh.kulka...@linaro.org>

	PR target/90724
	* config/aarch64/aarch64.c (aarch64_gen_compare_reg_maybe_ze):
	Force y in reg if it fails aarch64_plus_operand predicate.
---
 gcc/config/aarch64/aarch64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 26a28570be9..b452a53af99 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1910,6 +1910,9 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
 	}
     }
 
+  if (!aarch64_plus_operand (y, y_mode))
+    y = force_reg (y_mode, y);
+
   return aarch64_gen_compare_reg (code, x, y);
 }
 
-- 
2.17.1
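[An illustrative aside, not part of the patch: this backport carries no testcase of its own.  A hypothetical sketch of the kind of source that exercises the fixed path is an atomic compare-and-swap whose expected value is a constant that is not a valid add/sub immediate, so that it fails aarch64_plus_operand; compile with something like -O2 -march=armv8.2-a+sve.  This is an assumption for illustration only, not the reproducer from PR target/90724.]

/* Illustration only; not the PR testcase.  The expected value
   0x87654321 is not an AArch64 add/sub immediate, so before the fix
   it could reach aarch64_gen_compare_reg_maybe_ze without satisfying
   aarch64_plus_operand.  */
int
cas (unsigned int *p)
{
  return __sync_bool_compare_and_swap (p, 0x87654321u, 0u);
}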
>From a8222bb635ed70c19992a352987880174aee1701 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandif...@arm.com>
Date: Mon, 11 Nov 2019 19:43:52 +0000
Subject: [PATCH 2/4] Fix SLP downward group access classification [PR92420]

This PR was caused by the SLP handling in get_group_load_store_type
returning VMAT_CONTIGUOUS rather than VMAT_CONTIGUOUS_REVERSE for
downward groups.

A more elaborate fix would be to try to combine the reverse permutation
into SLP_TREE_LOAD_PERMUTATION for loads, but that's really a follow-on
optimisation and not backport material.  It might also not necessarily
be a win, if the target supports (say) reversing and odd/even swaps
as independent permutes but doesn't recognise the combined form.

2020-02-18  Richard Sandiford  <richard.sandif...@arm.com>

gcc/
	Backport from mainline
	2019-11-11  Richard Sandiford  <richard.sandif...@arm.com>

	PR tree-optimization/92420
	* tree-vect-stmts.c (get_negative_load_store_type): Move further
	up file.
	(get_group_load_store_type): Use it for reversed SLP accesses.

gcc/testsuite/
	PR tree-optimization/92420
	* gcc.dg/vect/pr92420.c: New test.
---
 gcc/testsuite/gcc.dg/vect/pr92420.c |  48 ++++++++++++
 gcc/tree-vect-stmts.c               | 110 +++++++++++++++-------------
 2 files changed, 107 insertions(+), 51 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr92420.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr92420.c b/gcc/testsuite/gcc.dg/vect/pr92420.c
new file mode 100644
index 00000000000..e43539fbbd7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr92420.c
@@ -0,0 +1,48 @@
+/* { dg-additional-options "-mavx2" { target avx_runtime } } */
+
+#include "tree-vect.h"
+
+#define N 16
+struct C { int r, i; };
+struct C a[N], b[N], c[N], d[N], e[N];
+
+__attribute__((noipa)) static void
+foo (struct C *__restrict x, struct C *__restrict y, struct C *__restrict z, int w)
+{
+  int i;
+  for (int i = 0; i < w; i++)
+    {
+      z[i].r = x[i].r * y[-1 - i].r - x[i].i * y[-1 - i].i;
+      z[i].i = x[i].i * y[-1 - i].r + x[i].r * y[-1 - i].i;
+    }
+}
+
+__attribute__((noipa)) static void
+bar (struct C *__restrict x, struct C *__restrict y, struct C *__restrict z, int w)
+{
+  int i;
+  for (int i = 0; i < w; i++)
+    {
+      z[i].r = x[i].r * y[i].r - x[i].i * y[i].i;
+      z[i].i = x[i].i * y[i].r + x[i].r * y[i].i;
+    }
+}
+
+int
+main ()
+{
+  check_vect ();
+  int i;
+  for (i = 0; i < N; ++i)
+    {
+      a[i].r = N - i; a[i].i = i - N;
+      b[i].r = i - N; b[i].i = i + N;
+      c[i].r = -1 - i; c[i].i = 2 * N - 1 - i;
+    }
+  foo (a, b + N, d, N);
+  bar (a, c, e, N);
+  for (i = 0; i < N; ++i)
+    if (d[i].r != e[i].r || d[i].i != e[i].i)
+      __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index baabdb03b5d..8fd7af19817 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2166,6 +2166,56 @@ perm_mask_for_reverse (tree vectype)
   return vect_gen_perm_mask_checked (vectype, indices);
 }
 
+/* A subroutine of get_load_store_type, with a subset of the same
+   arguments.  Handle the case where STMT_INFO is a load or store that
+   accesses consecutive elements with a negative step.  */
+
+static vect_memory_access_type
+get_negative_load_store_type (stmt_vec_info stmt_info, tree vectype,
+                              vec_load_store_type vls_type,
+                              unsigned int ncopies)
+{
+  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+  dr_alignment_support alignment_support_scheme;
+
+  if (ncopies > 1)
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "multiple types with negative step.\n");
+      return VMAT_ELEMENTWISE;
+    }
+
+  alignment_support_scheme = vect_supportable_dr_alignment (dr_info, false);
+  if (alignment_support_scheme != dr_aligned
+      && alignment_support_scheme != dr_unaligned_supported)
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "negative step but alignment required.\n");
+      return VMAT_ELEMENTWISE;
+    }
+
+  if (vls_type == VLS_STORE_INVARIANT)
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "negative step with invariant source;"
+                         " no permute needed.\n");
+      return VMAT_CONTIGUOUS_DOWN;
+    }
+
+  if (!perm_mask_for_reverse (vectype))
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "negative step and reversing not supported.\n");
+      return VMAT_ELEMENTWISE;
+    }
+
+  return VMAT_CONTIGUOUS_REVERSE;
+}
+
 /* STMT_INFO is either a masked or unconditional store.  Return the
    value being stored.  */
 
@@ -2268,7 +2318,15 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
                                 "Peeling for outer loop is not supported\n");
               return false;
             }
-          *memory_access_type = VMAT_CONTIGUOUS;
+          int cmp = compare_step_with_zero (stmt_info);
+          if (cmp < 0)
+            *memory_access_type = get_negative_load_store_type
+              (stmt_info, vectype, vls_type, 1);
+          else
+            {
+              gcc_assert (!loop_vinfo || cmp > 0);
+              *memory_access_type = VMAT_CONTIGUOUS;
+            }
         }
     }
   else
@@ -2370,56 +2428,6 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
   return true;
 }
 
-/* A subroutine of get_load_store_type, with a subset of the same
-   arguments.  Handle the case where STMT_INFO is a load or store that
-   accesses consecutive elements with a negative step.  */
-
-static vect_memory_access_type
-get_negative_load_store_type (stmt_vec_info stmt_info, tree vectype,
-                              vec_load_store_type vls_type,
-                              unsigned int ncopies)
-{
-  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
-  dr_alignment_support alignment_support_scheme;
-
-  if (ncopies > 1)
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "multiple types with negative step.\n");
-      return VMAT_ELEMENTWISE;
-    }
-
-  alignment_support_scheme = vect_supportable_dr_alignment (dr_info, false);
-  if (alignment_support_scheme != dr_aligned
-      && alignment_support_scheme != dr_unaligned_supported)
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "negative step but alignment required.\n");
-      return VMAT_ELEMENTWISE;
-    }
-
-  if (vls_type == VLS_STORE_INVARIANT)
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location,
-                         "negative step with invariant source;"
-                         " no permute needed.\n");
-      return VMAT_CONTIGUOUS_DOWN;
-    }
-
-  if (!perm_mask_for_reverse (vectype))
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "negative step and reversing not supported.\n");
-      return VMAT_ELEMENTWISE;
-    }
-
-  return VMAT_CONTIGUOUS_REVERSE;
-}
-
 /* Analyze load or store statement STMT_INFO of type VLS_TYPE.  Return true
    if there is a memory access type that the vectorized form can use,
    storing it in *MEMORY_ACCESS_TYPE if so.  If we decide to use gathers
-- 
2.17.1
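[Another illustrative aside, not part of the patch: in case the memory-access kinds above are unfamiliar, the sketch below is a hand-written conceptual picture, under my own simplifying assumptions rather than actual vectorizer output, of what VMAT_CONTIGUOUS_REVERSE means for a downward access like the y[-1 - i] group in the testcase: one contiguous vector load followed by a reversing permute.  Plain VMAT_CONTIGUOUS (the bug) would omit the reversal, and VMAT_ELEMENTWISE would fall back to per-element loads.]

/* Conceptual sketch only, not vectorizer output: load the four
   elements that sit just below P contiguously, then reverse them,
   which is the effect VMAT_CONTIGUOUS_REVERSE describes for a
   negative-step access.  */
static inline void
load_reversed_4 (int dst[4], const int *p)
{
  int tmp[4];
  __builtin_memcpy (tmp, p - 4, sizeof tmp);  /* contiguous load below P */
  for (int i = 0; i < 4; ++i)                 /* reversing permute */
    dst[i] = tmp[3 - i];
}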
>From 7ac680f5867e0390f62157dae50ecd358ada2234 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandif...@arm.com>
Date: Fri, 29 Nov 2019 13:04:56 +0000
Subject: [PATCH 3/4] Don't pass booleans as mask types to simd clones
 [PR92710]

In this PR we assigned a vector mask type to the result of a comparison
and then tried to pass that mask type to a simd clone, which expected
a normal (non-mask) type instead.

This patch simply punts on call arguments that have a mask type.
A better fix would be to pattern-match the comparison to a COND_EXPR,
like we would if the comparison was stored to memory, but doing that
isn't gcc 9 or 10 material.

Note that this doesn't affect x86_64-linux-gnu because the ABI promotes
bool arguments to ints.

2020-02-18  Richard Sandiford  <richard.sandif...@arm.com>

gcc/
	Backport from mainline
	2019-11-29  Richard Sandiford  <richard.sandif...@arm.com>

	PR tree-optimization/92710
	* tree-vect-stmts.c (vectorizable_simd_clone_call): Reject
	vector mask arguments.

gcc/testsuite/
	PR tree-optimization/92710
	* gcc.dg/vect/pr92710.c: New test.
---
 gcc/testsuite/gcc.dg/vect/pr92710.c | 12 ++++++++++++
 gcc/tree-vect-stmts.c               | 11 ++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr92710.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr92710.c b/gcc/testsuite/gcc.dg/vect/pr92710.c
new file mode 100644
index 00000000000..2986d4ce06a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr92710.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+
+#pragma omp declare simd
+_Bool foo (_Bool) __attribute__((const));
+
+void
+f (_Bool *restrict x, char *restrict y, char *restrict z)
+{
+  for (int i = 0; i < 128; ++i)
+    x[i] = foo (y[i] == z[i]);
+}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 8fd7af19817..507f81b0a0e 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -3915,7 +3915,16 @@ vectorizable_simd_clone_call (stmt_vec_info stmt_info,
           || thisarginfo.dt == vect_external_def)
         gcc_assert (thisarginfo.vectype == NULL_TREE);
       else
-        gcc_assert (thisarginfo.vectype != NULL_TREE);
+        {
+          gcc_assert (thisarginfo.vectype != NULL_TREE);
+          if (VECTOR_BOOLEAN_TYPE_P (thisarginfo.vectype))
+            {
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "vector mask arguments are not supported\n");
+              return false;
+            }
+        }
 
       /* For linear arguments, the analyze phase should have saved
          the base and step in STMT_VINFO_SIMD_CLONE_INFO.  */
-- 
2.17.1
>From da055b66ff0309c41b5f4d7db12e42e27123373b Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandif...@arm.com>
Date: Mon, 27 Jan 2020 19:37:55 +0000
Subject: [PATCH 4/4] predcom: Fix invalid store-store commoning [PR93434]

predcom has the following code to stop one rogue load from
interfering with other store-load opportunities:

	      /* If A is read and B write or vice versa and there is unsuitable
		 dependence, instead of merging both components into a component
		 that will certainly not pass suitable_component_p, just put the
		 read into bad component, perhaps at least the write together with
		 all the other data refs in it's component will be optimizable.  */

But when store-store commoning was added later, this had the effect
of ignoring loads that occur between two candidate stores.

There is code further up to handle loads and stores with unknown
dependences:

	  /* Don't do store elimination if there is any unknown dependence for
	     any store data reference.  */
	  if ((DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
	      && (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
		  || DDR_NUM_DIST_VECTS (ddr) == 0))
	    eliminate_store_p = false;

But the store-load code above skips loads for *known* dependences
if (a) the load has already been marked "bad" or (b) the data-ref
machinery knows the dependence distance, but determine_offsets can't
handle the combination.

(a) happens to be the problem in the testcase, but a different
sequence could have given (b) instead.  We have writes to individual
fields of a structure and reads from the whole structure.  Since
determine_offsets requires the types to be the same, it returns false
for each such read/write combination.

This patch records which components have had loads removed and
prevents store-store commoning for them.  It's a bit too pessimistic,
since there shouldn't be a problem if a "bad" load dominates all stores
in a component.  But (a) we can't AFAIK use pcom_stmt_dominates_stmt_p
here and (b) the handling for that case would probably need to be
removed again if we handled more exotic cases in future.

2020-02-18  Richard Sandiford  <richard.sandif...@arm.com>

gcc/
	Backport from mainline
	2020-01-28  Richard Sandiford  <richard.sandif...@arm.com>

	PR tree-optimization/93434
	* tree-predcom.c (split_data_refs_to_components): Record which
	components have had aliasing loads removed.  Prevent store-store
	commoning for all such components.

gcc/testsuite/
	PR tree-optimization/93434
	* gcc.c-torture/execute/pr93434.c: New test.
---
 gcc/testsuite/gcc.c-torture/execute/pr93434.c | 36 +++++++++++++++++++
 gcc/tree-predcom.c                            | 24 +++++++++++--
 2 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/pr93434.c

diff --git a/gcc/testsuite/gcc.c-torture/execute/pr93434.c b/gcc/testsuite/gcc.c-torture/execute/pr93434.c
new file mode 100644
index 00000000000..e786252794b
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr93434.c
@@ -0,0 +1,36 @@
+typedef struct creal_T {
+  double re;
+  double im;
+} creal_T;
+
+#define N 16
+int main() {
+  int k;
+  int i;
+  int j;
+  creal_T t2[N];
+  double inval;
+
+  inval = 1.0;
+  for (j = 0; j < N; ++j) {
+    t2[j].re = 0;
+    t2[j].im = 0;
+  }
+
+  for (j = 0; j < N/4; j++) {
+    i = j * 4;
+    t2[i].re = inval;
+    t2[i].im = inval;
+    k = i + 3;
+    t2[k].re = inval;
+    t2[k].im = inval;
+    t2[i] = t2[k];
+    t2[k].re = inval;
+  }
+
+  for (i = 0; i < 2; ++i)
+    if (t2[i].re != !i || t2[i].im != !i)
+      __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c
index 8e83a715a24..dd66cdc12ca 100644
--- a/gcc/tree-predcom.c
+++ b/gcc/tree-predcom.c
@@ -767,6 +767,7 @@ split_data_refs_to_components (struct loop *loop,
   /* Don't do store elimination if loop has multiple exit edges.  */
   bool eliminate_store_p = single_exit (loop) != NULL;
   basic_block last_always_executed = last_always_executed_block (loop);
+  auto_bitmap no_store_store_comps;
 
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
@@ -838,9 +839,13 @@ split_data_refs_to_components (struct loop *loop,
       else if (DR_IS_READ (dra) && ib != bad)
         {
           if (ia == bad)
-            continue;
+            {
+              bitmap_set_bit (no_store_store_comps, ib);
+              continue;
+            }
           else if (!determine_offset (dra, drb, &dummy_off))
             {
+              bitmap_set_bit (no_store_store_comps, ib);
               merge_comps (comp_father, comp_size, bad, ia);
               continue;
             }
@@ -848,9 +853,13 @@ split_data_refs_to_components (struct loop *loop,
       else if (DR_IS_READ (drb) && ia != bad)
         {
           if (ib == bad)
-            continue;
+            {
+              bitmap_set_bit (no_store_store_comps, ia);
+              continue;
+            }
           else if (!determine_offset (dra, drb, &dummy_off))
             {
+              bitmap_set_bit (no_store_store_comps, ia);
               merge_comps (comp_father, comp_size, bad, ib);
               continue;
             }
@@ -908,6 +917,17 @@ split_data_refs_to_components (struct loop *loop,
       comp->refs.quick_push (dataref);
     }
 
+  if (eliminate_store_p)
+    {
+      bitmap_iterator bi;
+      EXECUTE_IF_SET_IN_BITMAP (no_store_store_comps, 0, ia, bi)
+        {
+          ca = component_of (comp_father, ia);
+          if (ca != bad)
+            comps[ca]->eliminate_store_p = false;
+        }
+    }
+
   for (i = 0; i < n; i++)
     {
       comp = comps[i];
-- 
2.17.1
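[A final illustrative aside, not taken from the PR or the patch: the hazard the last patch guards against is, in essence, two stores to the same location with a load of that location in between, mirroring the t2[k].re = inval; t2[i] = t2[k]; t2[k].re = inval; sequence in the testcase.  Store-store commoning must not treat the earlier store as dead, because the intervening whole-struct load still consumes its value.  A reduced sketch of that shape:]

/* Reduced sketch, my own example rather than the PR's: the earlier
   store to a[i + 3].re is overwritten later in the same iteration,
   but the whole-struct read in between still needs its value, so
   deleting it would change what a[i] receives.  */
struct creal { double re, im; };

void
f (struct creal *a, int n, double v)
{
  for (int i = 0; i + 3 < n; i += 4)
    {
      a[i + 3].re = v;        /* earlier store (field write) */
      a[i] = a[i + 3];        /* whole-struct read of a[i + 3] */
      a[i + 3].re = v + 1.0;  /* later store overwrites the field */
    }
}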