Hi! The following testcase, which I wrote to test the ncopies > 1 handling of inclusive scan vectorization, reveals that we don't vectorize it: we required a MULT_EXPR on the DR_OFFSET, but for 1-byte elements there is obviously none, because the index is not scaled by an element size.
Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk. 2019-06-21 Jakub Jelinek <ja...@redhat.com> * tree-vect-data-refs.c (vect_find_stmt_data_reference): Handle "omp simd array" arrays with one byte elements. * gcc.dg/vect/vect-simd-11.c: New test. * gcc.target/i386/sse2-vect-simd-11.c: New test. * gcc.target/i386/avx2-vect-simd-11.c: New test. * gcc.target/i386/avx512bw-vect-simd-11.c: New test. --- gcc/tree-vect-data-refs.c.jj 2019-06-20 13:26:29.071150988 +0200 +++ gcc/tree-vect-data-refs.c 2019-06-20 13:55:35.421150589 +0200 @@ -4075,14 +4075,17 @@ vect_find_stmt_data_reference (loop_p lo && integer_zerop (DR_STEP (newdr))) { tree off = DR_OFFSET (newdr); + tree step = ssize_int (1); STRIP_NOPS (off); - if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST - && TREE_CODE (off) == MULT_EXPR + if (TREE_CODE (off) == MULT_EXPR && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) { - tree step = TREE_OPERAND (off, 1); + step = TREE_OPERAND (off, 1); off = TREE_OPERAND (off, 0); STRIP_NOPS (off); + } + if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST) + { if (CONVERT_EXPR_P (off) && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0))) < TYPE_PRECISION (TREE_TYPE (off)))) --- gcc/testsuite/gcc.dg/vect/vect-simd-11.c.jj 2019-06-20 13:49:16.322081280 +0200 +++ gcc/testsuite/gcc.dg/vect/vect-simd-11.c 2019-06-20 12:58:52.516069619 +0200 @@ -0,0 +1,186 @@ +/* { dg-require-effective-target size32plus } */ +/* { dg-additional-options "-fopenmp-simd" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ + +#ifndef main +#include "tree-vect.h" +#endif + +int r, a[1024], b[1024]; +unsigned short r2, b2[1024]; +unsigned char r3, b3[1024]; + +__attribute__((noipa)) void +foo (int *a, int *b, unsigned short *b2, unsigned char *b3) +{ + #pragma omp simd reduction (inscan, +:r, r2, r3) + for (int i = 0; i < 1024; i++) + { + { r += a[i]; r2 
+= a[i]; r3 += a[i]; } + #pragma omp scan inclusive(r, r2, r3) + { + b[i] = r; + b2[i] = r2; + b3[i] = r3; + } + } +} + +__attribute__((noipa)) int +bar (unsigned short *s2p, unsigned char *s3p) +{ + int s = 0; + unsigned short s2 = 0; + unsigned char s3 = 0; + #pragma omp simd reduction (inscan, +:s, s2, s3) + for (int i = 0; i < 1024; i++) + { + { + s += 2 * a[i]; + s2 += 2 * a[i]; + s3 += 2 * a[i]; + } + #pragma omp scan inclusive(s, s2, s3) + { b[i] = s; b2[i] = s2; b3[i] = s3; } + } + *s2p = s2; + *s3p = s3; + return s; +} + +__attribute__((noipa)) void +baz (int *a, int *b, unsigned short *b2, unsigned char *b3) +{ + #pragma omp simd reduction (inscan, +:r, r2, r3) if (simd: 0) + for (int i = 0; i < 1024; i++) + { + { + r += a[i]; + r2 += a[i]; + r3 += a[i]; + } + #pragma omp scan inclusive(r, r2, r3) + { + b[i] = r; + b2[i] = r2; + b3[i] = r3; + } + } +} + +__attribute__((noipa)) int +qux (unsigned short *s2p, unsigned char *s3p) +{ + int s = 0; + unsigned short s2 = 0; + unsigned char s3 = 0; + #pragma omp simd reduction (inscan, +:s, s2, s3) simdlen (1) + for (int i = 0; i < 1024; i++) + { + { s += 2 * a[i]; s2 += 2 * a[i]; s3 += 2 * a[i]; } + #pragma omp scan inclusive(s, s2, s3) + { b[i] = s; b2[i] = s2; b3[i] = s3; } + } + *s2p = s2; + *s3p = s3; + return s; +} + +int +main () +{ + int s = 0; + unsigned short s2; + unsigned char s3; +#ifndef main + check_vect (); +#endif + for (int i = 0; i < 1024; ++i) + { + a[i] = i; + b[i] = -1; + b2[i] = -1; + b3[i] = -1; + asm ("" : "+g" (i)); + } + foo (a, b, b2, b3); + if (r != 1024 * 1023 / 2 + || r2 != (unsigned short) r + || r3 != (unsigned char) r) + abort (); + for (int i = 0; i < 1024; ++i) + { + s += i; + if (b[i] != s + || b2[i] != (unsigned short) s + || b3[i] != (unsigned char) s) + abort (); + else + { + b[i] = 25; + b2[i] = 24; + b3[i] = 26; + } + } + if (bar (&s2, &s3) != 1024 * 1023) + abort (); + if (s2 != (unsigned short) (1024 * 1023) + || s3 != (unsigned char) (1024 * 1023)) + abort (); + s = 0; 
+ for (int i = 0; i < 1024; ++i) + { + s += 2 * i; + if (b[i] != s + || b2[i] != (unsigned short) s + || b3[i] != (unsigned char) s) + abort (); + else + { + b[i] = -1; + b2[i] = -1; + b3[i] = -1; + } + } + r = 0; + r2 = 0; + r3 = 0; + baz (a, b, b2, b3); + if (r != 1024 * 1023 / 2 + || r2 != (unsigned short) r + || r3 != (unsigned char) r) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + s += i; + if (b[i] != s + || b2[i] != (unsigned short) s + || b3[i] != (unsigned char) s) + abort (); + else + { + b[i] = 25; + b2[i] = 24; + b3[i] = 26; + } + } + s2 = 0; + s3 = 0; + if (qux (&s2, &s3) != 1024 * 1023) + abort (); + if (s2 != (unsigned short) (1024 * 1023) + || s3 != (unsigned char) (1024 * 1023)) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + s += 2 * i; + if (b[i] != s + || b2[i] != (unsigned short) s + || b3[i] != (unsigned char) s) + abort (); + } + return 0; +} --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-11.c.jj 2019-06-20 13:50:04.160330704 +0200 +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-11.c 2019-06-20 13:50:12.607198170 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "sse2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-11.c" + +static void +sse2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-11.c.jj 2019-06-20 13:50:27.008972212 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-11.c 2019-06-20 13:50:37.264811298 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include 
"../../gcc.dg/vect/vect-simd-11.c" + +static void +avx2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-11.c.jj 2019-06-20 13:52:33.859983753 +0200 +++ gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-11.c 2019-06-20 13:52:49.293742912 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512bw -mprefer-vector-width=512 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx512bw } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx512bw-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-11.c" + +static void +avx512bw_test (void) +{ + do_main (); +} Jakub