Hi!

If-conversion can't easily predict whether outer loop vectorization will be
successful or not.  In GCC 6 we'd version and guard with LOOP_VECTORIZED only
the inner loop, which is beneficial if the outer loop vectorization isn't
successful but inner loop vectorization is.  This changed last year, and now
we sometimes version the outer loop instead.  The problem with that is that,
according to the PR, it regresses some important benchmarks where the outer
loop can't really be vectorized (but if-conversion doesn't know that in
advance), and because the inner loop needs masked loads/stores, which are
only available in the if-converted copy that is thrown away together with the
failed outer loop, the inner loop isn't vectorized either.
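
To make the failure mode concrete, here is a reduced sketch of such a loop
nest, modeled on the new gcc.target/i386/pr78899.c testcase below (the array
names and the factor of 4 are just for illustration):

  int p1[1024], c[1024];

  void
  foo (int n)
  {
    int i, k;
    for (k = 0; k < n / 4; k++)	/* outer loop, the one if-conv may version */
      for (i = 0; i < 4; i++)	/* short inner loop */
	if (c[k * 4 + i])	/* conditional update; vectorizing this needs
				   if-conversion plus masked loads/stores */
	  p1[k * 4 + i] += 1;
  }

If only the outer loop is versioned and its vectorization then fails, the
surviving scalar copy was never if-converted, so the masked load/store based
inner loop vectorization is lost as well.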
The following patch tweaks tree-if-conv.c so that when it wants to version an
outer loop, it actually transforms:
  loop1
    loop2
into:
  if (LOOP_VECTORIZED (1, 3))
    {
      loop1
        loop2
    }
  else
    loop3 (copy of loop1)
      if (LOOP_VECTORIZED (4, 5))
        loop4 (copy of loop2)
      else
        loop5 (copy of loop4)
and tweaks the vectorizer so that in this case it always attempts to
vectorize the outer loop (i.e. loop1) first.  If that is successful, it marks
loop4 as non-vectorizable and folds the latter LOOP_VECTORIZED call, so that
the inner loop isn't vectorized in the scalar loop copies.  If outer loop
vectorization fails, loop1/loop2 are thrown away and we get loop3
non-vectorized, with either loop4 vectorized inside of it or, if even that
fails, loop5.

I think vectorizing the outer loop inside the scalar version would be a waste
of code size - it would be used for just a very small number of iterations of
the outer loop.  If you think we should vectorize even there, we'd need
further changes - handle the LOOP_VECTORIZED ifn during copying of loops in
tree-vect-loop-manip.c and ensure the vectorizer runs even on the copied
loops; right now the vectorizer only processes the loops created before
vectorization starts (except for epilogue vectorization).
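
As a reminder, LOOP_VECTORIZED above stands for an internal function call in
the IL; the vectorizer folds each call to a constant once it knows whether
the corresponding loop has been vectorized.  Schematically, the outer guard
looks roughly like this in the GIMPLE dumps (a simplified sketch from memory;
the SSA name and basic block numbers are made up):

  _1 = .LOOP_VECTORIZED (1, 3);	/* folded to 1 if loop1 is vectorized,
				   to 0 otherwise */
  if (_1 != 0)
    goto <bb 4>;	/* enters loop1, the version to be vectorized */
  else
    goto <bb 9>;	/* enters loop3, the scalar copy */

With the patch, successfully vectorizing the outer loop1 in addition folds
the inner .LOOP_VECTORIZED (4, 5) guard in the scalar copy to 0, which is
what prevents the useless vectorization of loop4 there.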
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-01-06  Jakub Jelinek  <ja...@redhat.com>

	PR tree-optimization/78899
	* tree-if-conv.c (version_loop_for_if_conversion): Instead of
	returning bool return struct loop *, NULL for failure and the new
	loop on success.
	(versionable_outer_loop_p): Don't version outer loop if it has
	dont_vectorize bit set.
	(tree_if_conversion): When versioning outer loop, ensure
	tree_if_conversion is performed also on the inner loop of the
	non-vectorizable outer loop copy.
	* tree-vectorizer.c (set_uid_loop_bbs): Formatting fix.  Fold
	LOOP_VECTORIZED in inner loop of the scalar outer loop and
	prevent vectorization of it.
	(vectorize_loops): For outer + inner LOOP_VECTORIZED, ensure the
	outer loop vectorization of the non-scalar version is attempted
	before vectorization of the inner loop in scalar version.
	* tree-vect-loop-manip.c (rename_variables_in_bb): If outer_loop
	has 2 inner loops, rename also on edges from bb whose single pred
	is outer_loop->header.  Fix typo in function comment.

	* gcc.target/i386/pr78899.c: New test.
	* gcc.dg/pr71077.c: New test.
	* gcc.dg/gomp/pr68128-1.c: Adjust allowed number of vectorized
	loops.

--- gcc/tree-if-conv.c.jj	2017-01-04 18:12:50.180878924 +0100
+++ gcc/tree-if-conv.c	2017-01-06 09:24:08.410557481 +0100
@@ -2535,7 +2535,7 @@ combine_blocks (struct loop *loop)
    loop to execute.  The vectorizer pass will fold this
    internal call into either true or false.  */
 
-static bool
+static struct loop *
 version_loop_for_if_conversion (struct loop *loop)
 {
   basic_block cond_bb;
@@ -2566,7 +2566,7 @@ version_loop_for_if_conversion (struct l
     ifc_bbs[i]->aux = saved_preds[i];
 
   if (new_loop == NULL)
-    return false;
+    return NULL;
 
   new_loop->dont_vectorize = true;
   new_loop->force_vectorize = false;
@@ -2574,7 +2574,7 @@ version_loop_for_if_conversion (struct l
   gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num));
   gsi_insert_before (&gsi, g, GSI_SAME_STMT);
   update_ssa (TODO_update_ssa);
-  return true;
+  return new_loop;
 }
 
 /* Return true when LOOP satisfies the follow conditions that will
@@ -2594,6 +2594,7 @@ static bool
 versionable_outer_loop_p (struct loop *loop)
 {
   if (!loop_outer (loop)
+      || loop->dont_vectorize
       || !loop->inner
       || loop->inner->next
       || !single_exit (loop)
@@ -2602,7 +2603,7 @@ versionable_outer_loop_p (struct loop *l
       || !single_pred_p (loop->latch)
       || !single_pred_p (loop->inner->latch))
     return false;
-  
+
   basic_block outer_exit = single_pred (loop->latch);
   basic_block inner_exit = single_pred (loop->inner->latch);
@@ -2789,7 +2790,10 @@ tree_if_conversion (struct loop *loop)
 {
   unsigned int todo = 0;
   bool aggressive_if_conv;
+  struct loop *rloop;
 
+ again:
+  rloop = NULL;
   ifc_bbs = NULL;
   any_pred_load_store = false;
   any_complicated_phi = false;
@@ -2829,8 +2833,31 @@ tree_if_conversion (struct loop *loop)
       struct loop *vloop
	= (versionable_outer_loop_p (loop_outer (loop))
	   ? loop_outer (loop) : loop);
-      if (!version_loop_for_if_conversion (vloop))
+      struct loop *nloop = version_loop_for_if_conversion (vloop);
+      if (nloop == NULL)
	goto cleanup;
+      if (vloop != loop)
+	{
+	  /* If versionable_outer_loop_p decided to version the
+	     outer loop, version also the inner loop of the non-vectorized
+	     loop copy.  So we transform:
+	      loop1
+		loop2
+	     into:
+	      if (LOOP_VECTORIZED (1, 3))
+		{
+		  loop1
+		    loop2
+		}
+	      else
+		loop3 (copy of loop1)
+		  if (LOOP_VECTORIZED (4, 5))
+		    loop4 (copy of loop2)
+		  else
+		    loop5 (copy of loop4)  */
+	  gcc_assert (nloop->inner && nloop->inner->next == NULL);
+	  rloop = nloop->inner;
+	}
     }
 
   /* Now all statements are if-convertible.  Combine all the basic
@@ -2854,6 +2881,11 @@ tree_if_conversion (struct loop *loop)
       free (ifc_bbs);
       ifc_bbs = NULL;
     }
+  if (rloop != NULL)
+    {
+      loop = rloop;
+      goto again;
+    }
 
   return todo;
 }
--- gcc/tree-vectorizer.c.jj	2017-01-04 18:12:49.960881733 +0100
+++ gcc/tree-vectorizer.c	2017-01-06 09:24:08.411557468 +0100
@@ -465,6 +465,7 @@ fold_loop_vectorized_call (gimple *g, tr
       update_stmt (use_stmt);
     }
 }
+
 /* Set the uids of all the statements in basic blocks inside loop
    represented by LOOP_VINFO.  LOOP_VECTORIZED_CALL is the internal
    call guarding the loop which has been if converted.  */
@@ -477,9 +478,22 @@ set_uid_loop_bbs (loop_vec_info loop_vin
   struct loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
 
   LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop;
-  gcc_checking_assert (vect_loop_vectorized_call
-		       (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
+  gcc_checking_assert (vect_loop_vectorized_call (scalar_loop)
		       == loop_vectorized_call);
+  /* If we are going to vectorize outer loop, prevent vectorization
+     of the inner loop in the scalar loop - either the scalar loop is
+     thrown away, so it is a wasted work, or is used only for
+     a few iterations.  */
+  if (scalar_loop->inner)
+    {
+      gimple *g = vect_loop_vectorized_call (scalar_loop->inner);
+      if (g)
+	{
+	  arg = gimple_call_arg (g, 0);
+	  get_loop (cfun, tree_to_shwi (arg))->dont_vectorize = true;
+	  fold_loop_vectorized_call (g, boolean_false_node);
+	}
+    }
   bbs = get_loop_body (scalar_loop);
   for (i = 0; i < scalar_loop->num_nodes; i++)
     {
@@ -534,14 +548,59 @@ vectorize_loops (void)
      only over initial loops skipping newly generated ones.  */
   FOR_EACH_LOOP (loop, 0)
     if (loop->dont_vectorize)
-      any_ifcvt_loops = true;
-    else if ((flag_tree_loop_vectorize
-	      && optimize_loop_nest_for_speed_p (loop))
-	     || loop->force_vectorize)
       {
-	loop_vec_info loop_vinfo, orig_loop_vinfo = NULL;
-	gimple *loop_vectorized_call = vect_loop_vectorized_call (loop);
-vectorize_epilogue:
+	any_ifcvt_loops = true;
+	/* If-conversion sometimes versions both the outer loop
+	   (for the case when outer loop vectorization might be
+	   desirable) as well as the inner loop in the scalar version
+	   of the loop.  So we have:
+	    if (LOOP_VECTORIZED (1, 3))
+	      {
+		loop1
+		  loop2
+	      }
+	    else
+	      loop3 (copy of loop1)
+		if (LOOP_VECTORIZED (4, 5))
+		  loop4 (copy of loop2)
+		else
+		  loop5 (copy of loop4)
+	   If FOR_EACH_LOOP gives us loop3 first (which has
+	   dont_vectorize set), make sure to process loop1 before loop4;
+	   so that we can prevent vectorization of loop4 if loop1
+	   is successfully vectorized.  */
+	if (loop->inner)
+	  {
+	    gimple *loop_vectorized_call
+	      = vect_loop_vectorized_call (loop);
+	    if (loop_vectorized_call
+		&& vect_loop_vectorized_call (loop->inner))
+	      {
+		tree arg = gimple_call_arg (loop_vectorized_call, 0);
+		struct loop *vector_loop
+		  = get_loop (cfun, tree_to_shwi (arg));
+		if (vector_loop && vector_loop != loop)
+		  {
+		    loop = vector_loop;
+		    /* Make sure we don't vectorize it twice.  */
+		    loop->dont_vectorize = true;
+		    goto try_vectorize;
+		  }
+	      }
+	  }
+      }
+    else
+      {
+	loop_vec_info loop_vinfo, orig_loop_vinfo;
+	gimple *loop_vectorized_call;
+       try_vectorize:
+	if (!((flag_tree_loop_vectorize
+	       && optimize_loop_nest_for_speed_p (loop))
+	      || loop->force_vectorize))
+	  continue;
+	orig_loop_vinfo = NULL;
+	loop_vectorized_call = vect_loop_vectorized_call (loop);
+       vectorize_epilogue:
	vect_location = find_loop_location (loop);
	if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
	    && dump_enabled_p ())
--- gcc/tree-vect-loop-manip.c.jj	2017-01-01 12:45:36.000000000 +0100
+++ gcc/tree-vect-loop-manip.c	2017-01-06 10:22:43.248168286 +0100
@@ -71,7 +71,7 @@ rename_use_op (use_operand_p op_p)
 }
 
 
-/* Renames the variables in basic block BB.  Allow renaming of PHI argumnets
+/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
    true.  */
@@ -102,9 +102,25 @@ rename_variables_in_bb (basic_block bb,
 
   FOR_EACH_EDGE (e, ei, bb->preds)
     {
-      if (!flow_bb_inside_loop_p (loop, e->src)
-	  && (!rename_from_outer_loop || e->src != outer_loop->header))
-	continue;
+      if (!flow_bb_inside_loop_p (loop, e->src))
+	{
+	  if (!rename_from_outer_loop)
+	    continue;
+	  if (e->src != outer_loop->header)
+	    {
+	      if (outer_loop->inner->next)
+		{
+		  /* If outer_loop has 2 inner loops, allow there to
+		     be an extra basic block which decides which of the
+		     two loops to use using LOOP_VECTORIZED.  */
+		  if (!single_pred_p (e->src)
+		      || single_pred (e->src) != outer_loop->header)
+		    continue;
+		}
+	      else
+		continue;
+	    }
+	}
       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
--- gcc/testsuite/gcc.target/i386/pr78899.c.jj	2017-01-06 09:24:08.411557468 +0100
+++ gcc/testsuite/gcc.target/i386/pr78899.c	2017-01-06 09:24:08.411557468 +0100
@@ -0,0 +1,27 @@
+/* PR tree-optimization/78899 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fopenmp-simd -mavx2 -mno-avx512f" } */
+
+#define N 1024
+#define M 4
+int p1[N], p2[N], p3[N], c[N];
+
+void
+foo (int n)
+{
+  int i, k;
+  for (k = 0; k < n / M; k++)
+    {
+      #pragma omp simd
+      for (i = 0; i < M; i++)
+	if (c[k * M + i])
+	  {
+	    p1[k * M + i] += 1;
+	    p2[k * M + i] = p3[k * M + i] + 2;
+	  }
+    }
+}
+
+/* Ensure the loop is vectorized.  */
+/* { dg-final { scan-assembler "vpmaskmov" } } */
+/* { dg-final { scan-assembler "vpadd" } } */
--- gcc/testsuite/gcc.dg/gomp/pr68128-1.c.jj	2017-01-04 18:12:50.033880801 +0100
+++ gcc/testsuite/gcc.dg/gomp/pr68128-1.c	2017-01-06 09:24:08.419557365 +0100
@@ -29,4 +29,4 @@ foo (float *u, float v, float w, float x
     }
 }
 
-/* { dg-final { scan-tree-dump "note: vectorized 1 loops in function" "vect" { target i?86-*-* x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump "note: vectorized \[12] loops in function" "vect" { target i?86-*-* x86_64-*-* } } } */
--- gcc/testsuite/gcc.dg/pr71077.c.jj	2017-01-06 09:24:08.420557352 +0100
+++ gcc/testsuite/gcc.dg/pr71077.c	2017-01-06 09:24:08.419557365 +0100
@@ -0,0 +1,14 @@
+/* PR c++/71077 */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */
+
+void
+foo (int *a, int n)
+{
+  int b, c;
+  for (b = 0; b < n; b++)
+    for (c = 0; c < 32; c++)
+      if ((b & 1U) << c)
+	a[b + c] = 0;
+}

	Jakub