On Tue, Oct 06, 2020 at 08:22:13AM +0200, Richard Biener wrote: > > I was really hoping bbs 4 and 5 would be one loop (the one I set safelen > > and force_vectorize etc. for) and that basic blocks 6 and 7 would be > > together with that inner loop another loop, but apparently loop discovery > > thinks it is just one loop. > > Any ideas what I'm doing wrong or is there any way how to make it two loops > > (that would also survive all the cfg cleanups until vectorization)? > > The early CFG looks like we have a common header with two latches > so it boils down to how we disambiguate those in the end (we seem > to unify the latches via a forwarder). IIRC OMP lowering builds > loops itself, could it not do the appropriate disambiguation itself?
I realized I emit the same statements on both paths (before "goto doit;" and before falling through to it), at least the MIN_EXPR and PLUS_EXPR. So by forcing an extra bb there which performs those two, and placing the "doit" label before it, the innermost loop no longer has multiple latches and is vectorized fine. Will commit this after a full bootstrap/regtest. Thanks. 2020-10-06 Jakub Jelinek <ja...@redhat.com> * omp-expand.c (expand_omp_simd): Don't emit MIN_EXPR and PLUS_EXPR at the end of entry_bb and innermost init_bb, instead force arguments for MIN_EXPR into temporaries in both cases and jump to a new bb that performs MIN_EXPR and PLUS_EXPR. * gcc.dg/gomp/simd-2.c: New test. * gcc.dg/gomp/simd-3.c: New test. --- gcc/omp-expand.c.jj 2020-09-26 10:09:57.524001314 +0200 +++ gcc/omp-expand.c 2020-10-06 13:38:14.295073351 +0200 @@ -6347,6 +6347,7 @@ expand_omp_simd (struct omp_region *regi tree n2var = NULL_TREE; tree n2v = NULL_TREE; tree *nonrect_bounds = NULL; + tree min_arg1 = NULL_TREE, min_arg2 = NULL_TREE; if (fd->collapse > 1) { if (broken_loop || gimple_omp_for_combined_into_p (fd->for_stmt)) @@ -6406,9 +6407,10 @@ expand_omp_simd (struct omp_region *regi fold_convert (itype, fd->loops[i].step)); t = fold_convert (type, t); tree t2 = fold_build2 (MINUS_EXPR, type, n2, n1); - t = fold_build2 (MIN_EXPR, type, t2, t); - t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t); - expand_omp_build_assign (&gsi, n2var, t); + min_arg1 = create_tmp_var (type); + expand_omp_build_assign (&gsi, min_arg1, t2); + min_arg2 = create_tmp_var (type); + expand_omp_build_assign (&gsi, min_arg2, t); } else { @@ -6815,7 +6817,16 @@ expand_omp_simd (struct omp_region *regi } else t = counts[i + 1]; - t = fold_build2 (MIN_EXPR, type, t2, t); + expand_omp_build_assign (&gsi, min_arg1, t2); + expand_omp_build_assign (&gsi, min_arg2, t); + e = split_block (init_bb, last_stmt (init_bb)); + gsi = gsi_after_labels (e->dest); + init_bb = e->dest; + remove_edge (FALLTHRU_EDGE (entry_bb)); + 
make_edge (entry_bb, init_bb, EDGE_FALLTHRU); + set_immediate_dominator (CDI_DOMINATORS, init_bb, entry_bb); + set_immediate_dominator (CDI_DOMINATORS, l1_bb, init_bb); + t = fold_build2 (MIN_EXPR, type, min_arg1, min_arg2); t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t); expand_omp_build_assign (&gsi, n2var, t); } --- gcc/testsuite/gcc.dg/gomp/simd-2.c.jj 2020-10-06 13:33:53.568870663 +0200 +++ gcc/testsuite/gcc.dg/gomp/simd-2.c 2020-10-06 13:32:59.674655600 +0200 @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fopenmp -fdump-tree-vect-details" } */ +/* { dg-additional-options "-mavx" { target avx } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-9]\[0-9]* loops in function" 5 "vect" } } */ + +int a[10000][128]; + +void +foo (void) +{ + #pragma omp for simd schedule (simd: dynamic, 32) collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} + +void +bar (void) +{ + #pragma omp parallel for simd schedule (simd: dynamic, 32) collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} + +void +baz (void) +{ + #pragma omp distribute parallel for simd schedule (simd: dynamic, 32) collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} + +void +qux (void) +{ + #pragma omp distribute simd dist_schedule (static, 128) collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} + +void +corge (void) +{ + #pragma omp taskloop simd collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} --- gcc/testsuite/gcc.dg/gomp/simd-3.c.jj 2020-10-06 13:33:59.543783638 +0200 +++ gcc/testsuite/gcc.dg/gomp/simd-3.c 2020-10-06 13:36:25.650655684 +0200 @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fopenmp -fdump-tree-vect-details" } */ +/* { dg-additional-options "-mavx" { target avx } } */ +/* { dg-final { scan-tree-dump-times "vectorized 
\[1-9]\[0-9]* loops in function" 5 "vect" } } */ + +int a[1024][1024]; + +void +foo (void) +{ + #pragma omp for simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} + +void +bar (void) +{ + #pragma omp parallel for simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} + +void +baz (void) +{ + #pragma omp distribute parallel for simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} + +void +qux (void) +{ + #pragma omp distribute simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} + +void +corge (void) +{ + #pragma omp taskloop simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} Jakub