https://gcc.gnu.org/g:68326d5d1a593dc0bf098c03aac25916168bc5a9
commit r15-6807-g68326d5d1a593dc0bf098c03aac25916168bc5a9
Author: Alex Coplan <alex.cop...@arm.com>
Date:   Mon Mar 11 13:09:10 2024 +0000

    vect: Force alignment peeling to vectorize more early break loops [PR118211]

    This allows us to vectorize more loops with early exits by forcing
    peeling for alignment to make sure that we're guaranteed to be able to
    safely read an entire vector iteration without crossing a page boundary.

    To make this work for VLA architectures we have to allow compile-time
    non-constant target alignments. We also have to override the result of
    the target's preferred_vector_alignment hook if it isn't a power-of-two
    multiple of the TYPE_SIZE of the chosen vector type.

    gcc/ChangeLog:

            PR tree-optimization/118211
            PR tree-optimization/116126
            * tree-vect-data-refs.cc (vect_analyze_early_break_dependences):
            Set need_peeling_for_alignment flag on read DRs instead of
            failing vectorization. Punt on gathers.
            (dr_misalignment): Handle non-constant target alignments.
            (vect_compute_data_ref_alignment): If need_peeling_for_alignment
            flag is set on the DR, then override the target alignment chosen
            by the preferred_vector_alignment hook to choose a safe
            alignment.
            (vect_supportable_dr_alignment): Override
            support_vector_misalignment hook if need_peeling_for_alignment
            is set on the DR: in this case we must return
            dr_unaligned_unsupported in order to force peeling.
            * tree-vect-loop-manip.cc (vect_do_peeling): Allow prolog
            peeling by a compile-time non-constant amount.
            * tree-vectorizer.h (dr_vec_info): Add new flag
            need_peeling_for_alignment.

    gcc/testsuite/ChangeLog:

            PR tree-optimization/118211
            PR tree-optimization/116126
            * gcc.dg/tree-ssa/cunroll-13.c: Don't vectorize.
            * gcc.dg/tree-ssa/cunroll-14.c: Likewise.
            * gcc.dg/unroll-6.c: Likewise.
            * gcc.dg/tree-ssa/gen-vect-28.c: Likewise.
            * gcc.dg/vect/vect-104.c: Expect to vectorize.
            * gcc.dg/vect/vect-early-break_108-pr113588.c: Likewise.
            * gcc.dg/vect/vect-early-break_109-pr113588.c: Likewise.
* gcc.dg/vect/vect-early-break_110-pr113467.c: Likewise. * gcc.dg/vect/vect-early-break_3.c: Likewise. * gcc.dg/vect/vect-early-break_65.c: Likewise. * gcc.dg/vect/vect-early-break_8.c: Likewise. * gfortran.dg/vect/vect-5.f90: Likewise. * gfortran.dg/vect/vect-8.f90: Likewise. * gcc.dg/vect/vect-switch-search-line-fast.c: Co-Authored-By: Tamar Christina <tamar.christ...@arm.com> Diff: --- gcc/testsuite/gcc.dg/tree-ssa/cunroll-13.c | 2 +- gcc/testsuite/gcc.dg/tree-ssa/cunroll-14.c | 2 +- gcc/testsuite/gcc.dg/tree-ssa/gen-vect-28.c | 1 + gcc/testsuite/gcc.dg/unroll-6.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-104.c | 1 + .../gcc.dg/vect/vect-early-break_108-pr113588.c | 2 +- .../gcc.dg/vect/vect-early-break_109-pr113588.c | 2 +- .../gcc.dg/vect/vect-early-break_110-pr113467.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-early-break_3.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-early-break_65.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-early-break_8.c | 2 +- .../gcc.dg/vect/vect-switch-search-line-fast.c | 3 +- gcc/testsuite/gfortran.dg/vect/vect-5.f90 | 1 + gcc/testsuite/gfortran.dg/vect/vect-8.f90 | 5 +- gcc/tree-vect-data-refs.cc | 113 ++++++++++++++++++--- gcc/tree-vect-loop-manip.cc | 6 -- gcc/tree-vectorizer.h | 5 + 17 files changed, 119 insertions(+), 34 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-13.c b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-13.c index 98cb56a8564b..154e2963f12d 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-13.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-13.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -fgimple -fdump-tree-cunroll-blocks-details" } */ +/* { dg-options "-O3 -fgimple -fdump-tree-cunroll-blocks-details -fno-tree-vectorize" } */ #if __SIZEOF_INT__ < 4 __extension__ typedef __INT32_TYPE__ i32; diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-14.c b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-14.c index 5f112da310c8..4b369f7ad278 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-14.c +++ 
b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-14.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -fdump-tree-cunroll-blocks-details" } */ +/* { dg-options "-O3 -fdump-tree-cunroll-blocks-details -fno-tree-vectorize" } */ struct a {int a[100];}; void t(struct a *a) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-28.c b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-28.c index c5f1b5aff115..5c0ea58a7b00 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-28.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-28.c @@ -20,6 +20,7 @@ int main_1 (int off) } /* check results: */ +#pragma GCC novector for (i = 0; i < N; i++) { if (ia[i+off] != 5) diff --git a/gcc/testsuite/gcc.dg/unroll-6.c b/gcc/testsuite/gcc.dg/unroll-6.c index 7664bbff109f..7be1b7cfadba 100644 --- a/gcc/testsuite/gcc.dg/unroll-6.c +++ b/gcc/testsuite/gcc.dg/unroll-6.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -fdump-rtl-loop2_unroll-details-blocks -funroll-loops" } */ +/* { dg-options "-O3 -fdump-rtl-loop2_unroll-details-blocks -funroll-loops -fno-tree-vectorize" } */ /* { dg-require-effective-target int32plus } */ void abort (void); diff --git a/gcc/testsuite/gcc.dg/vect/vect-104.c b/gcc/testsuite/gcc.dg/vect/vect-104.c index 730efd39bd4a..8890a5da180b 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-104.c +++ b/gcc/testsuite/gcc.dg/vect/vect-104.c @@ -46,6 +46,7 @@ int main1 (int x) { #pragma GCC novector for (i = 0; i < N; i++) { +#pragma GCC novector for (j = 0; j < N; j++) { if (p->a[i][j] != c[i][j]) diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c index e488619c9aac..78b22f3b43b4 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c @@ -3,7 +3,7 @@ /* { dg-require-effective-target vect_early_break } */ /* { dg-require-effective-target vect_int } */ -/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ +/* { 
dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ int foo (const char *s, unsigned long n) { diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c index 488c19d3ede8..2347fc26a14f 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c @@ -3,7 +3,7 @@ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target mmap } */ -/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ #include <sys/mman.h> #include <unistd.h> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c index 12d0ea1e871b..4f5a87c3ab94 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c @@ -2,7 +2,7 @@ /* { dg-require-effective-target vect_early_break } */ /* { dg-require-effective-target vect_long_long } */ -/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ #include "tree-vect.h" #include <stdint.h> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c index 4afbc7266765..9d6cd0a191f6 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_3.c @@ -5,7 +5,7 @@ /* { dg-additional-options "-Ofast" } */ -/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ unsigned test4(char x, char *vect, int n) { diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_65.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_65.c index fa87999dcd4c..8763a5ff04ec 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_65.c +++ 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_65.c @@ -17,4 +17,4 @@ void f() { return; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c index 84e19423e2e6..541f439a9b49 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_8.c @@ -5,7 +5,7 @@ /* { dg-additional-options "-Ofast" } */ -/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ #include <complex.h> diff --git a/gcc/testsuite/gcc.dg/vect/vect-switch-search-line-fast.c b/gcc/testsuite/gcc.dg/vect/vect-switch-search-line-fast.c index 15f3a4ef38a7..02ad7a451ca2 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-switch-search-line-fast.c +++ b/gcc/testsuite/gcc.dg/vect/vect-switch-search-line-fast.c @@ -14,4 +14,5 @@ const unsigned char *search_line_fast2 (const unsigned char *s, return s; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ilp32 } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" { target { ! 
ilp32 } } } } */ diff --git a/gcc/testsuite/gfortran.dg/vect/vect-5.f90 b/gcc/testsuite/gfortran.dg/vect/vect-5.f90 index b11cabaee23d..cca4875b859b 100644 --- a/gcc/testsuite/gfortran.dg/vect/vect-5.f90 +++ b/gcc/testsuite/gfortran.dg/vect/vect-5.f90 @@ -18,6 +18,7 @@ end do do I = 1, N +!GCC$ novector do J = I, M if (A(J,2) /= B(J)) then STOP 1 diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 index 918eddee292f..d4ce44feb4b9 100644 --- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 +++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 @@ -706,7 +706,6 @@ CALL track('KERNEL ') RETURN END SUBROUTINE kernel -! { dg-final { scan-tree-dump-times "vectorized 2\[56\] loops" 1 "vect" { target aarch64_sve } } } -! { dg-final { scan-tree-dump-times "vectorized 2\[45\] loops" 1 "vect" { target { aarch64*-*-* && { ! aarch64_sve } } } } } -! { dg-final { scan-tree-dump-times "vectorized 2\[3456\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } +! { dg-final { scan-tree-dump-times "vectorized 2\[56\] loops" 1 "vect" { target aarch64*-*-* } } } +! { dg-final { scan-tree-dump-times "vectorized 2\[34567\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } ! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { { ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } } diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index c10508de5554..6eda40267bd1 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -34,6 +34,7 @@ along with GCC; see the file COPYING3. 
If not see #include "optabs-tree.h" #include "cgraph.h" #include "dumpfile.h" +#include "pretty-print.h" #include "alias.h" #include "fold-const.h" #include "stor-layout.h" @@ -750,15 +751,23 @@ vect_analyze_early_break_dependences (loop_vec_info loop_vinfo) if (DR_IS_READ (dr_ref) && !ref_within_array_bound (stmt, DR_REF (dr_ref))) { + if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo) + || STMT_VINFO_STRIDED_P (stmt_vinfo)) + { + const char *msg + = "early break not supported: cannot peel " + "for alignment, vectorization would read out of " + "bounds at %G"; + return opt_result::failure_at (stmt, msg, stmt); + } + + dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_vinfo); + dr_info->need_peeling_for_alignment = true; + if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "early breaks not supported: vectorization " - "would %s beyond size of obj.\n", - DR_IS_READ (dr_ref) ? "read" : "write"); - return opt_result::failure_at (stmt, - "can't safely apply code motion to " - "dependencies of %G to vectorize " - "the early exit.\n", stmt); + dump_printf_loc (MSG_NOTE, vect_location, + "marking DR (read) as needing peeling for " + "alignment at %G", stmt); } if (DR_IS_READ (dr_ref)) @@ -1241,11 +1250,15 @@ dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset) offset which can for example result from a negative stride access. */ poly_int64 misalignment = misalign + diff + offset; - /* vect_compute_data_ref_alignment will have ensured that target_alignment - is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */ - unsigned HOST_WIDE_INT target_alignment_c - = dr_info->target_alignment.to_constant (); - if (!known_misalignment (misalignment, target_alignment_c, &misalign)) + /* Below we reject compile-time non-constant target alignments, but if + our misalignment is zero, then we are known to already be aligned + w.r.t. any such possible target alignment. 
*/ + if (known_eq (misalignment, 0)) + return 0; + + unsigned HOST_WIDE_INT target_alignment_c; + if (!dr_info->target_alignment.is_constant (&target_alignment_c) + || !known_misalignment (misalignment, target_alignment_c, &misalign)) return DR_MISALIGNMENT_UNKNOWN; return misalign; } @@ -1313,6 +1326,9 @@ vect_record_base_alignments (vec_info *vinfo) Compute the misalignment of the data reference DR_INFO when vectorizing with VECTYPE. + RESULT is non-NULL iff VINFO is a loop_vec_info. In that case, *RESULT will + be set appropriately on failure (but is otherwise left unchanged). + Output: 1. initialized misalignment info for DR_INFO @@ -1321,7 +1337,7 @@ vect_record_base_alignments (vec_info *vinfo) static void vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info, - tree vectype) + tree vectype, opt_result *result = nullptr) { stmt_vec_info stmt_info = dr_info->stmt; vec_base_alignments *base_alignments = &vinfo->base_alignments; @@ -1348,6 +1364,67 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info, poly_uint64 vector_alignment = exact_div (targetm.vectorize.preferred_vector_alignment (vectype), BITS_PER_UNIT); + + /* If this DR needs peeling for alignment for correctness, we must + ensure the target alignment is a constant power-of-two multiple of the + amount read per vector iteration (overriding the above hook where + necessary). */ + if (dr_info->need_peeling_for_alignment) + { + /* Vector size in bytes. */ + poly_uint64 safe_align = tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype)); + + /* We can only peel for loops, of course. */ + gcc_checking_assert (loop_vinfo); + + /* Calculate the number of vectors read per vector iteration. If + it is a power of two, multiply through to get the required + alignment in bytes. Otherwise, fail analysis since alignment + peeling wouldn't work in such a case. 
*/ + poly_uint64 num_scalars = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + num_scalars *= DR_GROUP_SIZE (stmt_info); + + auto num_vectors = vect_get_num_vectors (num_scalars, vectype); + if (!pow2p_hwi (num_vectors)) + { + *result = opt_result::failure_at (vect_location, + "non-power-of-two num vectors %u " + "for DR needing peeling for " + "alignment at %G", + num_vectors, stmt_info->stmt); + return; + } + + safe_align *= num_vectors; + if (maybe_gt (safe_align, 4096U)) + { + pretty_printer pp; + pp_wide_integer (&pp, safe_align); + *result = opt_result::failure_at (vect_location, + "alignment required for correctness" + " (%s) may exceed page size", + pp_formatted_text (&pp)); + return; + } + + unsigned HOST_WIDE_INT multiple; + if (!constant_multiple_p (vector_alignment, safe_align, &multiple) + || !pow2p_hwi (multiple)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "forcing alignment for DR from preferred ("); + dump_dec (MSG_NOTE, vector_alignment); + dump_printf (MSG_NOTE, ") to safe align ("); + dump_dec (MSG_NOTE, safe_align); + dump_printf (MSG_NOTE, ") for stmt: %G", stmt_info->stmt); + } + vector_alignment = safe_align; + } + } + SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment); /* If the main loop has peeled for alignment we have no way of knowing @@ -2865,8 +2942,12 @@ vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo) if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt) && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt) continue; + opt_result res = opt_result::success (); vect_compute_data_ref_alignment (loop_vinfo, dr_info, - STMT_VINFO_VECTYPE (dr_info->stmt)); + STMT_VINFO_VECTYPE (dr_info->stmt), + &res); + if (!res) + return res; } } @@ -7130,6 +7211,8 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info, if (misalignment == 0) return dr_aligned; + else if (dr_info->need_peeling_for_alignment) + return dr_unaligned_unsupported; /* For now 
assume all conditional loads/stores support unaligned access without any special code. */ diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 5d1b70aea43c..15cac0fe27df 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -3128,12 +3128,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, int estimated_vf; int prolog_peeling = 0; bool vect_epilogues = loop_vinfo->epilogue_vinfo != NULL; - /* We currently do not support prolog peeling if the target alignment is not - known at compile time. 'vect_gen_prolog_loop_niters' depends on the - target alignment being constant. */ - dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); - if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ()) - return NULL; if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 135eb119ca2e..79db02a39a8f 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1278,6 +1278,11 @@ public: poly_uint64 target_alignment; /* If true the alignment of base_decl needs to be increased. */ bool base_misaligned; + + /* Set by early break vectorization when this DR needs peeling for alignment + for correctness. */ + bool need_peeling_for_alignment; + tree base_decl; /* Stores current vectorized loop's offset. To be added to the DR's