On Fri, Jul 20, 2018 at 12:57 PM Richard Sandiford <richard.sandif...@arm.com> wrote: > > We could vectorise: > > for (...) > { > a[0] = ...; > a[1] = ...; > a[2] = ...; > a[3] = ...; > a += stride; > } > > (including the case when stride == 8) but not: > > for (...) > { > a[0] = ...; > a[1] = ...; > a[2] = ...; > a[3] = ...; > a += 8; > } > > (where the stride is always 8). The former was treated as a "grouped > and strided" store, while the latter was treated as grouped store with > gaps, which we don't support. > > This patch makes us treat groups of stores with gaps at the end as > strided groups too. I tried to go through all uses of STMT_VINFO_STRIDED_P > and all vector uses of DR_STEP to see whether there were any hard-baked > assumptions, but couldn't see any. I wondered whether we should relax: > > /* We do not have to consider dependences between accesses that belong > to the same group, unless the stride could be smaller than the > group size. */ > if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a) > && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a) > == DR_GROUP_FIRST_ELEMENT (stmtinfo_b)) > && !STMT_VINFO_STRIDED_P (stmtinfo_a)) > return false; > > for cases in which the step is constant and the absolute step is known > to be greater than the group size, but data dependence analysis should > already return chrec_known for those cases. > > The new test is a version of vect-avg-15.c with the variable step > replaced by a constant one. > > A natural follow-on would be to do the same for groups with gaps in > the middle: > > /* Check that the distance between two accesses is equal to the type > size. Otherwise, we have gaps. */ > diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) > - TREE_INT_CST_LOW (prev_init)) / type_size; > if (diff != 1) > { > [...] > if (DR_IS_WRITE (data_ref)) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "interleaved store with gaps\n"); > return false; > } > > But I think we should do that separately and see what the fallout > from this change is first.
Agreed. > Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf > and x86_64-linux-gnu. OK to install? Do you need to set STMT_VINFO_STRIDED_P on all stmts? I think it is enough to set it on the first group element. OK otherwise. Thanks, Richard. > Richard > > > 2018-07-20 Richard Sandiford <richard.sandif...@arm.com> > > gcc/ > * tree-vect-data-refs.c (vect_analyze_group_access_1): Convert > grouped stores with gaps to a strided group. > > gcc/testsuite/ > * gcc.dg/vect/vect-avg-16.c: New test. > * gcc.dg/vect/slp-37.c: Expect the loop to be vectorized. > * gcc.dg/vect/vect-strided-u8-i8-gap4.c, > * gcc.dg/vect/vect-strided-u8-i8-gap4-big-array.c: Likewise for > the second loop in main1. > > Index: gcc/tree-vect-data-refs.c > =================================================================== > --- gcc/tree-vect-data-refs.c 2018-06-30 13:44:38.567611988 +0100 > +++ gcc/tree-vect-data-refs.c 2018-07-20 11:55:22.570911497 +0100 > @@ -2632,10 +2632,14 @@ vect_analyze_group_access_1 (struct data > if (groupsize != count > && !DR_IS_READ (dr)) > { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "interleaved store with gaps\n"); > - return false; > + groupsize = count; > + STMT_VINFO_STRIDED_P (stmt_info) = true; > + stmt_vec_info next_info = stmt_info; > + while (DR_GROUP_NEXT_ELEMENT (next_info)) > + { > + next_info = vinfo_for_stmt (DR_GROUP_NEXT_ELEMENT (next_info)); > + STMT_VINFO_STRIDED_P (next_info) = 1; > + } > } > > /* If there is a gap after the last load in the group it is the > @@ -2651,6 +2655,8 @@ vect_analyze_group_access_1 (struct data > "Detected interleaving "); > if (DR_IS_READ (dr)) > dump_printf (MSG_NOTE, "load "); > + else if (STMT_VINFO_STRIDED_P (stmt_info)) > + dump_printf (MSG_NOTE, "strided store "); > else > dump_printf (MSG_NOTE, "store "); > dump_printf (MSG_NOTE, "of size %u starting with ", > Index: gcc/testsuite/gcc.dg/vect/vect-avg-16.c > =================================================================== > --- /dev/null 2018-07-09 14:52:09.234750850 +0100 > +++ gcc/testsuite/gcc.dg/vect/vect-avg-16.c 2018-07-20 11:55:22.570911497 > +0100 > @@ -0,0 +1,52 @@ > +/* { dg-additional-options "-O3" } */ > +/* { dg-require-effective-target vect_int } */ > + > +#include "tree-vect.h" > + > +#define N 80 > + > +void __attribute__ ((noipa)) > +f (signed char *restrict a, signed char *restrict b, > + signed char *restrict c, int n) > +{ > + for (int j = 0; j < n; ++j) > + { > + for (int i = 0; i < 16; ++i) > + a[i] = (b[i] + c[i]) >> 1; > + a += 20; > + b += 20; > + c += 20; > + } > +} > + > +#define BASE1 -126 > +#define BASE2 -42 > + > +signed char a[N], b[N], c[N]; > + > +int > +main (void) > +{ > + check_vect (); > + > + for (int i = 0; i < N; ++i) > + { > + a[i] = i; > + b[i] = BASE1 + i * 3; > + c[i] = BASE2 + i * 2; > + asm volatile ("" ::: "memory"); > + } > + f (a, b, c, N / 20); > + for (int i = 0; i < N; ++i) > + { > + int d = (BASE1 + BASE2 + i * 5) >> 1; > + if (a[i] != (i % 20 < 16 ? d : i)) > + __builtin_abort (); > + } > + return 0; > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" > } } */ > +/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } > } */ > +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" { > target vect_avg_qi } } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target > vect_avg_qi } } } */ > Index: gcc/testsuite/gcc.dg/vect/slp-37.c > =================================================================== > --- gcc/testsuite/gcc.dg/vect/slp-37.c 2018-05-02 08:37:49.033604262 +0100 > +++ gcc/testsuite/gcc.dg/vect/slp-37.c 2018-07-20 11:55:22.570911497 +0100 > @@ -17,8 +17,8 @@ foo1 (s1 *arr) > int i; > s1 *ptr = arr; > > - /* Different constant types - not SLPable. The group size is not power of > 2, > - interleaving is not supported either. */ > + /* Vectorized as a strided SLP pair of accesses to <a, b> and a single > + strided access to c. */ > for (i = 0; i < N; i++) > { > ptr->a = 6; > @@ -58,6 +58,5 @@ int main (void) > return 0; > } > > -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ > -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" > } } */ > - > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" > } } */ > Index: gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4.c > =================================================================== > --- gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4.c 2018-05-02 > 08:37:49.021604375 +0100 > +++ gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4.c 2018-07-20 > 11:55:22.570911497 +0100 > @@ -53,7 +53,7 @@ main1 (s *arr) > } > > ptr = arr; > - /* Not vectorizable: gap in store. */ > + /* Vectorized as a strided SLP pair. */ > for (i = 0; i < N; i++) > { > res[i].a = ptr->b; > @@ -97,5 +97,4 @@ int main (void) > return 0; > } > > -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target > vect_strided8 } } } */ > - > +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target > vect_strided8 } } } */ > Index: gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-big-array.c > =================================================================== > --- gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-big-array.c > 2018-05-02 08:37:49.013604450 +0100 > +++ gcc/testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-big-array.c > 2018-07-20 11:55:22.570911497 +0100 > @@ -55,7 +55,7 @@ main1 (s *arr) > } > > ptr = arr; > - /* Not vectorizable: gap in store. */ > + /* Vectorized as a strided SLP pair. */ > for (i = 0; i < N; i++) > { > res[i].a = ptr->b; > @@ -110,5 +110,4 @@ int main (void) > return 0; > } > > -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target > vect_strided8 } } } */ > - > +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target > vect_strided8 } } } */