Richard Biener <rguent...@suse.de> writes:
> The following addresses the fact that with loop masking (or regular
> mask loads) we do not implement load shortening but we override
> the case where we need that for correctness.  Likewise when we
> attempt to use loop masking to handle large trailing gaps we cannot
> do so when there's this overrun case.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.  I'm going to
> wait for the arm/risc-v CI.
>
> Thanks,
> Richard.
>
>         PR tree-optimization/115895
>         * tree-vect-stmts.cc (get_group_load_store_type): When we
>         might overrun because the group size is not a multiple of the
>         vector size we cannot use loop masking since that does not
>         implement the required load shortening.
>
>         * gcc.target/i386/vect-pr115895.c: New testcase.
LGTM FWIW

Richard

> ---
>  gcc/testsuite/gcc.target/i386/vect-pr115895.c | 65 +++++++++++++++++++
>  gcc/tree-vect-stmts.cc                        | 24 +++++--
>  2 files changed, 84 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect-pr115895.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115895.c b/gcc/testsuite/gcc.target/i386/vect-pr115895.c
> new file mode 100644
> index 00000000000..2246c66d37e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-pr115895.c
> @@ -0,0 +1,65 @@
> +/* For some targets we end up vectorizing the below loop such that the `sp`
> +   single integer is loaded into a 4 integer vector.
> +   While the writes are all safe, without 2 scalar loops being peeled into the
> +   epilogue we would read past the end of the 31 integer array.  This happens
> +   because we load a 4 integer chunk to only use the first integer and
> +   increment by 2 integers at a time, hence the last load needs s[30-33] and
> +   the penultimate load needs s[28-31].
> +   This testcase ensures that we do not crash due to that behaviour.  */
> +/* { dg-do run } */
> +/* { dg-options "-std=gnu17 -O2 -ftree-vectorize -fno-vect-cost-model --param vect-partial-vector-usage=2 -mavx512bw -mprefer-vector-width=512" } */
> +/* { dg-require-effective-target mmap } */
> +#include <sys/mman.h>
> +#include <stdio.h>
> +
> +#define MMAP_SIZE 0x20000
> +#define ADDRESS 0x1122000000
> +
> +#define MB_BLOCK_SIZE 16
> +#define VERT_PRED_16 0
> +#define HOR_PRED_16 1
> +#define DC_PRED_16 2
> +int *sptr;
> +extern void intrapred_luma_16x16();
> +unsigned short mprr_2[5][16][16];
> +void initialise_s(int *s) { }
> +int main_1() {
> +  void *s_mapping;
> +  void *end_s;
> +  s_mapping = mmap ((void *)ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
> +                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (s_mapping == MAP_FAILED)
> +    {
> +      perror ("mmap");
> +      return 1;
> +    }
> +  end_s = (s_mapping + MMAP_SIZE);
> +  sptr = (int*)(end_s - sizeof(int[31]));
> +  intrapred_luma_16x16(sptr);
> +  return 0;
> +}
> +
> +void intrapred_luma_16x16(int * restrict sp) {
> +  for (int j=0; j < MB_BLOCK_SIZE; j++)
> +    {
> +      mprr_2[VERT_PRED_16][j][0]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][1]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][2]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][3]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][4]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][5]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][6]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][7]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][8]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][9]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][10]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][11]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][12]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][13]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][14]=sp[j*2];
> +      mprr_2[VERT_PRED_16][j][15]=sp[j*2];
> +    }
> +}
> +
> +#define DO_TEST main_1
> +#include "avx512-check.h"
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 0c0f999d3e3..b5dd1a2e40f 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2216,13 +2216,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>
>          If there is a combination of the access not covering the full
>          vector and a gap recorded then we may need to peel twice.  */
> +      bool large_vector_overrun_p = false;
>        if (loop_vinfo
>            && (*memory_access_type == VMAT_CONTIGUOUS
>                || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>            && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
>            && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
>                            nunits))
> -        overrun_p = true;
> +        large_vector_overrun_p = overrun_p = true;
>
>        /* If the gap splits the vector in half and the target
>           can do half-vector operations avoid the epilogue peeling
> @@ -2273,7 +2274,8 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>              access and that is sufficiently small to be covered
>              by the single scalar iteration.  */
>           unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
> -         if (!nunits.is_constant (&cnunits)
> +         if (masked_p
> +             || !nunits.is_constant (&cnunits)
>              || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
>              || (((cremain = (group_size * cvf - gap) % cnunits), true)
>                  && ((cpart_size = (1 << ceil_log2 (cremain))), true)
> @@ -2282,9 +2284,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>                       (vectype, cnunits / cpart_size,
>                        &half_vtype) == NULL_TREE)))
>             {
> -             /* If all fails we can still resort to niter masking, so
> -                enforce the use of partial vectors.  */
> -             if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> +             /* If all fails we can still resort to niter masking unless
> +                the vectors used are too big, so enforce the use of
> +                partial vectors.  */
> +             if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +                 && !large_vector_overrun_p)
>                 {
>                   if (dump_enabled_p ())
>                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -2302,6 +2306,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>                   return false;
>                 }
>             }
> +         else if (large_vector_overrun_p)
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "can't operate on partial vectors because "
> +                                "only unmasked loads handle access "
> +                                "shortening required because of gaps at "
> +                                "the end of the access\n");
> +             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +           }
>         }
>     }
>  }
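For reference, here is a reduced sketch of the access pattern the testcase
exercises (my own illustration, not part of the patch; the names s, out and
foo are made up, and the array bound and stride follow the comment in the
testcase above):

  /* Only element 0 of each stride-2 group is used, so the scalar loop
     never reads past s[30].  A full-width vector load of the group can
     read beyond the end of the array, and the masked-load path does not
     implement the load shortening that would avoid that overrun, which
     is what the patch addresses.  */
  int s[31];
  int out[16];

  void
  foo (void)
  {
    for (int j = 0; j < 16; j++)
      out[j] = s[j * 2];
  }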
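And just to make the shortening arithmetic in the hunk above concrete, with
purely illustrative numbers (not taken from the PR): for cnunits == 16,
cvf == 8, group_size == 2 and gap == 1 we get
cremain = (2 * 8 - 1) % 16 = 15 and cpart_size = 1 << ceil_log2 (15) = 16,
so the next power-of-two sized partial access is already a full vector,
whereas cremain == 8 would give cpart_size == 8, a half vector.  Either
way, the new masked_p and large_vector_overrun_p checks make sure we only
rely on this shortening on the unmasked path.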