On Mon, Jan 09, 2017 at 11:08:08AM +0100, Richard Biener wrote: > > > > There is one thing my patch doesn't do but should for efficiency, if > > > > loop1 > > > > (outer loop) is not successfully outer-loop vectorized, then we should > > > > mark > > > > loop2 (its inner loop) as dont_vectorize if the outer loop has been > > > > LOOP_VECTORIZED guarded. Then the gcc.dg/gomp/pr68128-1.c change > > > > wouldn't be actually needed. > > (*) > Ok, I don't have too many spare cycles either so can you fix (*)? Then > we can go with the extra versionings for GCC 7 for the moment and if any > of us has enough time to revisit this soon we can.
Here is untested (except for the affected testcases) patch to do that, ok if it passes bootstrap/regtest? I'll file [8 Regression] with details what we want to undo and do for GCC 8. 2017-01-09 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/78899 * tree-if-conv.c (version_loop_for_if_conversion): Instead of returning bool return struct loop *, NULL for failure and the new loop on success. (versionable_outer_loop_p): Don't version outer loop if it has dont_vectorized bit set. (tree_if_conversion): When versioning outer loop, ensure tree_if_conversion is performed also on the inner loop of the non-vectorizable outer loop copy. * tree-vectorizer.c (set_uid_loop_bbs): Formatting fix. Fold LOOP_VECTORIZED in inner loop of the scalar outer loop and prevent vectorization of it. (vectorize_loops): For outer + inner LOOP_VECTORIZED, ensure the outer loop vectorization of the non-scalar version is attempted before vectorization of the inner loop in scalar version. If outer LOOP_VECTORIZED guarded loop is not vectorized, prevent vectorization of its inner loop. * tree-vect-loop-manip.c (rename_variables_in_bb): If outer_loop has 2 inner loops, rename also on edges from bb whose single pred is outer_loop->header. Fix typo in function comment. * gcc.target/i386/pr78899.c: New test. * gcc.dg/pr71077.c: New test. --- gcc/tree-if-conv.c.jj 2017-01-06 19:34:04.052560851 +0100 +++ gcc/tree-if-conv.c 2017-01-09 11:52:59.154806369 +0100 @@ -2535,7 +2535,7 @@ combine_blocks (struct loop *loop) loop to execute. The vectorizer pass will fold this internal call into either true or false. */ -static bool +static struct loop * version_loop_for_if_conversion (struct loop *loop) { basic_block cond_bb; @@ -2566,7 +2566,7 @@ version_loop_for_if_conversion (struct l ifc_bbs[i]->aux = saved_preds[i]; if (new_loop == NULL) - return false; + return NULL; new_loop->dont_vectorize = true; new_loop->force_vectorize = false; @@ -2574,7 +2574,7 @@ version_loop_for_if_conversion (struct l gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num)); gsi_insert_before (&gsi, g, GSI_SAME_STMT); update_ssa (TODO_update_ssa); - return true; + return new_loop; } /* Return true when LOOP satisfies the follow conditions that will @@ -2594,6 +2594,7 @@ static bool versionable_outer_loop_p (struct loop *loop) { if (!loop_outer (loop) + || loop->dont_vectorize || !loop->inner || loop->inner->next || !single_exit (loop) @@ -2602,7 +2603,7 @@ versionable_outer_loop_p (struct loop *l || !single_pred_p (loop->latch) || !single_pred_p (loop->inner->latch)) return false; - + basic_block outer_exit = single_pred (loop->latch); basic_block inner_exit = single_pred (loop->inner->latch); @@ -2789,7 +2790,10 @@ tree_if_conversion (struct loop *loop) { unsigned int todo = 0; bool aggressive_if_conv; + struct loop *rloop; + again: + rloop = NULL; ifc_bbs = NULL; any_pred_load_store = false; any_complicated_phi = false; @@ -2829,8 +2833,31 @@ tree_if_conversion (struct loop *loop) struct loop *vloop = (versionable_outer_loop_p (loop_outer (loop)) ? loop_outer (loop) : loop); - if (!version_loop_for_if_conversion (vloop)) + struct loop *nloop = version_loop_for_if_conversion (vloop); + if (nloop == NULL) goto cleanup; + if (vloop != loop) + { + /* If versionable_outer_loop_p decided to version the + outer loop, version also the inner loop of the non-vectorized + loop copy. So we transform: + loop1 + loop2 + into: + if (LOOP_VECTORIZED (1, 3)) + { + loop1 + loop2 + } + else + loop3 (copy of loop1) + if (LOOP_VECTORIZED (4, 5)) + loop4 (copy of loop2) + else + loop5 (copy of loop4) */ + gcc_assert (nloop->inner && nloop->inner->next == NULL); + rloop = nloop->inner; + } } /* Now all statements are if-convertible. Combine all the basic @@ -2854,6 +2881,11 @@ tree_if_conversion (struct loop *loop) free (ifc_bbs); ifc_bbs = NULL; } + if (rloop != NULL) + { + loop = rloop; + goto again; + } return todo; } --- gcc/tree-vectorizer.c.jj 2017-01-06 19:34:04.064560697 +0100 +++ gcc/tree-vectorizer.c 2017-01-09 12:04:08.721053174 +0100 @@ -465,6 +465,7 @@ fold_loop_vectorized_call (gimple *g, tr update_stmt (use_stmt); } } + /* Set the uids of all the statements in basic blocks inside loop represented by LOOP_VINFO. LOOP_VECTORIZED_CALL is the internal call guarding the loop which has been if converted. */ @@ -477,9 +478,22 @@ set_uid_loop_bbs (loop_vec_info loop_vin struct loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg)); LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop; - gcc_checking_assert (vect_loop_vectorized_call - (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) + gcc_checking_assert (vect_loop_vectorized_call (scalar_loop) == loop_vectorized_call); + /* If we are going to vectorize outer loop, prevent vectorization + of the inner loop in the scalar loop - either the scalar loop is + thrown away, so it is a wasted work, or is used only for + a few iterations. */ + if (scalar_loop->inner) + { + gimple *g = vect_loop_vectorized_call (scalar_loop->inner); + if (g) + { + arg = gimple_call_arg (g, 0); + get_loop (cfun, tree_to_shwi (arg))->dont_vectorize = true; + fold_loop_vectorized_call (g, boolean_false_node); + } + } bbs = get_loop_body (scalar_loop); for (i = 0; i < scalar_loop->num_nodes; i++) { @@ -534,14 +548,59 @@ vectorize_loops (void) only over initial loops skipping newly generated ones. */ FOR_EACH_LOOP (loop, 0) if (loop->dont_vectorize) - any_ifcvt_loops = true; - else if ((flag_tree_loop_vectorize - && optimize_loop_nest_for_speed_p (loop)) - || loop->force_vectorize) { - loop_vec_info loop_vinfo, orig_loop_vinfo = NULL; - gimple *loop_vectorized_call = vect_loop_vectorized_call (loop); -vectorize_epilogue: + any_ifcvt_loops = true; + /* If-conversion sometimes versions both the outer loop + (for the case when outer loop vectorization might be + desirable) as well as the inner loop in the scalar version + of the loop. So we have: + if (LOOP_VECTORIZED (1, 3)) + { + loop1 + loop2 + } + else + loop3 (copy of loop1) + if (LOOP_VECTORIZED (4, 5)) + loop4 (copy of loop2) + else + loop5 (copy of loop4) + If FOR_EACH_LOOP gives us loop3 first (which has + dont_vectorize set), make sure to process loop1 before loop4; + so that we can prevent vectorization of loop4 if loop1 + is successfully vectorized. */ + if (loop->inner) + { + gimple *loop_vectorized_call + = vect_loop_vectorized_call (loop); + if (loop_vectorized_call + && vect_loop_vectorized_call (loop->inner)) + { + tree arg = gimple_call_arg (loop_vectorized_call, 0); + struct loop *vector_loop + = get_loop (cfun, tree_to_shwi (arg)); + if (vector_loop && vector_loop != loop) + { + loop = vector_loop; + /* Make sure we don't vectorize it twice. */ + loop->dont_vectorize = true; + goto try_vectorize; + } + } + } + } + else + { + loop_vec_info loop_vinfo, orig_loop_vinfo; + gimple *loop_vectorized_call; + try_vectorize: + if (!((flag_tree_loop_vectorize + && optimize_loop_nest_for_speed_p (loop)) + || loop->force_vectorize)) + continue; + orig_loop_vinfo = NULL; + loop_vectorized_call = vect_loop_vectorized_call (loop); + vectorize_epilogue: vect_location = find_loop_location (loop); if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION && dump_enabled_p ()) @@ -595,6 +654,12 @@ vectorize_epilogue: ret |= TODO_cleanup_cfg; } } + /* If outer loop vectorization fails for LOOP_VECTORIZED guarded + loop, don't vectorize its inner loop; we'll attempt to + vectorize LOOP_VECTORIZED guarded inner loop of the scalar + loop version. */ + if (loop_vectorized_call && loop->inner) + loop->inner->dont_vectorize = true; continue; } --- gcc/tree-vect-loop-manip.c.jj 2017-01-06 19:34:04.040561006 +0100 +++ gcc/tree-vect-loop-manip.c 2017-01-09 11:52:59.155806356 +0100 @@ -71,7 +71,7 @@ rename_use_op (use_operand_p op_p) } -/* Renames the variables in basic block BB. Allow renaming of PHI argumnets +/* Renames the variables in basic block BB. Allow renaming of PHI arguments on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is true. */ @@ -102,9 +102,25 @@ rename_variables_in_bb (basic_block bb, FOR_EACH_EDGE (e, ei, bb->preds) { - if (!flow_bb_inside_loop_p (loop, e->src) - && (!rename_from_outer_loop || e->src != outer_loop->header)) - continue; + if (!flow_bb_inside_loop_p (loop, e->src)) + { + if (!rename_from_outer_loop) + continue; + if (e->src != outer_loop->header) + { + if (outer_loop->inner->next) + { + /* If outer_loop has 2 inner loops, allow there to + be an extra basic block which decides which of the + two loops to use using LOOP_VECTORIZED. */ + if (!single_pred_p (e->src) + || single_pred (e->src) != outer_loop->header) + continue; + } + else + continue; + } + } for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e)); --- gcc/testsuite/gcc.target/i386/pr78899.c.jj 2017-01-09 11:52:59.156806343 +0100 +++ gcc/testsuite/gcc.target/i386/pr78899.c 2017-01-09 11:52:59.156806343 +0100 @@ -0,0 +1,27 @@ +/* PR tree-optimization/78899 */ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fopenmp-simd -mavx2 -mno-avx512f" } */ + +#define N 1024 +#define M 4 +int p1[N], p2[N], p3[N], c[N]; + +void +foo (int n) +{ + int i, k; + for (k = 0; k < n / M; k++) + { + #pragma omp simd + for (i = 0; i < M; i++) + if (c[k * M + i]) + { + p1[k * M + i] += 1; + p2[k * M + i] = p3[k * M + i] + 2; + } + } +} + +/* Ensure the loop is vectorized. */ +/* { dg-final { scan-assembler "vpmaskmov" } } */ +/* { dg-final { scan-assembler "vpadd" } } */ --- gcc/testsuite/gcc.dg/pr71077.c.jj 2017-01-09 11:52:59.179806042 +0100 +++ gcc/testsuite/gcc.dg/pr71077.c 2017-01-09 11:52:59.179806042 +0100 @@ -0,0 +1,14 @@ +/* PR c++/71077 */ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ +/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */ + +void +foo (int *a, int n) +{ + int b, c; + for (b = 0; b < n; b++) + for (c = 0; c < 32; c++) + if ((b & 1U) << c) + a[b + c] = 0; +} Jakub