I noticed that LIM could not hoist vector invariant, and that is why my first implementation tries to hoist them all.
In addition, there are two disadvantages of hoisting invariant load + lim method: First, for some instructions the scalar version is faster than the vector version, and in this case hoisting scalar instructions before vectorization is better. Those instructions include data packing/unpacking, integer multiplication with SSE2, etc.. Second, it may use more SIMD registers. The following code shows a simple example: char *a, *b, *c; for (int i = 0; i < N; ++i) a[i] = b[0] * c[0] + a[i]; Vectorizing b[0]*c[0] is worse than loading the result of b[0]*c[0] into a vector. thanks, Cong On Mon, Jan 13, 2014 at 5:37 AM, Richard Biener <rguent...@suse.de> wrote: > On Wed, 27 Nov 2013, Jakub Jelinek wrote: > >> On Wed, Nov 27, 2013 at 10:53:56AM +0100, Richard Biener wrote: >> > Hmm. I'm still thinking that we should handle this during the regular >> > transform step. >> >> I wonder if it can't be done instead just in vectorizable_load, >> if LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo) and the load is >> invariant, just emit the (broadcasted) load not inside of the loop, but on >> the loop preheader edge. > > So this implements this suggestion, XFAILing the no longer handled cases. > For example we get > > _94 = *b_8(D); > vect_cst_.18_95 = {_94, _94, _94, _94}; > _99 = prolog_loop_adjusted_niters.9_132 * 4; > vectp_a.22_98 = a_6(D) + _99; > ivtmp.43_77 = (unsigned long) vectp_a.22_98; > > <bb 13>: > # ivtmp.41_67 = PHI <ivtmp.41_70(3), 0(12)> > # ivtmp.43_71 = PHI <ivtmp.43_69(3), ivtmp.43_77(12)> > vect__10.19_97 = vect_cst_.18_95 + { 1, 1, 1, 1 }; > _76 = (void *) ivtmp.43_71; > MEM[base: _76, offset: 0B] = vect__10.19_97; > > ... > > instead of having hoisted *b_8 + 1 as scalar computation. Not sure > why LIM doesn't hoist the vector variant later. > > vect__10.19_97 = vect_cst_.18_95 + vect_cst_.20_96; > invariant up to level 1, cost 1. > > ah, the cost thing. Should be "improved" to see that hoisting > reduces the number of live SSA names in the loop. > > Eventually lower_vector_ssa could optimize vector to scalar > code again ... (ick). > > Bootstrap / regtest running on x86_64. > > Comments? > > Thanks, > Richard. > > 2014-01-13 Richard Biener <rguent...@suse.de> > > PR tree-optimization/58921 > PR tree-optimization/59006 > * tree-vect-loop-manip.c (vect_loop_versioning): Remove code > hoisting invariant stmts. > * tree-vect-stmts.c (vectorizable_load): Insert the splat of > invariant loads on the preheader edge if possible. > > * gcc.dg/torture/pr58921.c: New testcase. > * gcc.dg/torture/pr59006.c: Likewise. > * gcc.dg/vect/pr58508.c: XFAIL no longer handled cases. > > Index: gcc/tree-vect-loop-manip.c > =================================================================== > *** gcc/tree-vect-loop-manip.c (revision 206576) > --- gcc/tree-vect-loop-manip.c (working copy) > *************** vect_loop_versioning (loop_vec_info loop > *** 2435,2507 **** > } > } > > - > - /* Extract load statements on memrefs with zero-stride accesses. */ > - > - if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) > - { > - /* In the loop body, we iterate each statement to check if it is a > load. > - Then we check the DR_STEP of the data reference. If DR_STEP is zero, > - then we will hoist the load statement to the loop preheader. */ > - > - basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); > - int nbbs = loop->num_nodes; > - > - for (int i = 0; i < nbbs; ++i) > - { > - for (gimple_stmt_iterator si = gsi_start_bb (bbs[i]); > - !gsi_end_p (si);) > - { > - gimple stmt = gsi_stmt (si); > - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > - struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); > - > - if (is_gimple_assign (stmt) > - && (!dr > - || (DR_IS_READ (dr) && integer_zerop (DR_STEP (dr))))) > - { > - bool hoist = true; > - ssa_op_iter iter; > - tree var; > - > - /* We hoist a statement if all SSA uses in it are defined > - outside of the loop. */ > - FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_USE) > - { > - gimple def = SSA_NAME_DEF_STMT (var); > - if (!gimple_nop_p (def) > - && flow_bb_inside_loop_p (loop, gimple_bb (def))) > - { > - hoist = false; > - break; > - } > - } > - > - if (hoist) > - { > - if (dr) > - gimple_set_vuse (stmt, NULL); > - > - gsi_remove (&si, false); > - gsi_insert_on_edge_immediate (loop_preheader_edge > (loop), > - stmt); > - > - if (dump_enabled_p ()) > - { > - dump_printf_loc > - (MSG_NOTE, vect_location, > - "hoisting out of the vectorized loop: "); > - dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); > - dump_printf (MSG_NOTE, "\n"); > - } > - continue; > - } > - } > - gsi_next (&si); > - } > - } > - } > - > /* End loop-exit-fixes after versioning. */ > > if (cond_expr_stmt_list) > --- 2435,2440 ---- > Index: gcc/tree-vect-stmts.c > =================================================================== > *** gcc/tree-vect-stmts.c (revision 206576) > --- gcc/tree-vect-stmts.c (working copy) > *************** vectorizable_load (gimple stmt, gimple_s > *** 6368,6378 **** > /* 4. Handle invariant-load. */ > if (inv_p && !bb_vinfo) > { > - gimple_stmt_iterator gsi2 = *gsi; > gcc_assert (!grouped_load); > ! gsi_next (&gsi2); > ! new_temp = vect_init_vector (stmt, scalar_dest, > ! vectype, &gsi2); > new_stmt = SSA_NAME_DEF_STMT (new_temp); > } > > --- 6368,6402 ---- > /* 4. Handle invariant-load. */ > if (inv_p && !bb_vinfo) > { > gcc_assert (!grouped_load); > ! /* If we have versioned for aliasing then we are sure > ! this is a loop invariant load and thus we can insert > ! it on the preheader edge. */ > ! if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) > ! { > ! if (dump_enabled_p ()) > ! { > ! dump_printf_loc (MSG_NOTE, vect_location, > ! "hoisting out of the vectorized " > ! "loop: "); > ! dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); > ! dump_printf (MSG_NOTE, "\n"); > ! } > ! tree tem = copy_ssa_name (scalar_dest, NULL); > ! gsi_insert_on_edge_immediate > ! (loop_preheader_edge (loop), > ! gimple_build_assign (tem, > ! unshare_expr > ! (gimple_assign_rhs1 (stmt)))); > ! new_temp = vect_init_vector (stmt, tem, vectype, NULL); > ! } > ! else > ! { > ! gimple_stmt_iterator gsi2 = *gsi; > ! gsi_next (&gsi2); > ! new_temp = vect_init_vector (stmt, scalar_dest, > ! vectype, &gsi2); > ! } > new_stmt = SSA_NAME_DEF_STMT (new_temp); > } > > Index: gcc/testsuite/gcc.dg/torture/pr58921.c > =================================================================== > *** gcc/testsuite/gcc.dg/torture/pr58921.c (revision 0) > --- gcc/testsuite/gcc.dg/torture/pr58921.c (working copy) > *************** > *** 0 **** > --- 1,17 ---- > + /* { dg-do compile } */ > + > + int a[7]; > + int b; > + > + void > + fn1 () > + { > + for (; b; b++) > + a[b] = ((a[b] <= 0) == (a[0] != 0)); > + } > + > + int > + main () > + { > + return 0; > + } > Index: gcc/testsuite/gcc.dg/torture/pr59006.c > =================================================================== > *** gcc/testsuite/gcc.dg/torture/pr59006.c (revision 0) > --- gcc/testsuite/gcc.dg/torture/pr59006.c (working copy) > *************** > *** 0 **** > --- 1,13 ---- > + /* { dg-do compile } */ > + > + int a[8], b; > + void fn1(void) > + { > + int c; > + for (; b; b++) > + { > + int d = a[b]; > + c = a[0] ? d : 0; > + a[b] = c; > + } > + } > Index: gcc/testsuite/gcc.dg/vect/pr58508.c > =================================================================== > *** gcc/testsuite/gcc.dg/vect/pr58508.c (revision 206576) > --- gcc/testsuite/gcc.dg/vect/pr58508.c (working copy) > *************** void test5 (int* a, int* b) > *** 66,70 **** > } > } > > ! /* { dg-final { scan-tree-dump-times "hoist" 8 "vect" { xfail vect_no_align > } } } */ > /* { dg-final { cleanup-tree-dump "vect" } } */ > --- 66,71 ---- > } > } > > ! /* { dg-final { scan-tree-dump-times "hoist" 8 "vect" { xfail *-*-* } } } */ > ! /* { dg-final { scan-tree-dump-times "hoist" 3 "vect" { xfail vect_no_align > } } } */ > /* { dg-final { cleanup-tree-dump "vect" } } */