On Wed, Feb 8, 2012 at 3:39 PM, William J. Schmidt <wschm...@linux.vnet.ibm.com> wrote: > This is a vectorizer patch I inherited from Ira. As with the fix for > PR50969, it handles problems with excessive permutes by adjusting the > cost model. In this case the permutes come from type > promotion/demotion, which is now modeled with a new vec_cvt cost. > > This restores the lost performance for sphinx3, and gives wupwise an > additional boost. This is a performance workaround for 4.7; for 4.8 we > want to try to do a better job of modeling the real issue, which is that > VSX permutes are constrained to a single processor pipe. Thus isolated > permutes are less expensive per instruction than denser collections of > permutes. > > There also remains an opportunity for versioning loops for alignment. > > Bootstrapped and tested on powerpc64-linux with no regressions. OK for > trunk? > > Thanks, > Bill > > > 2012-02-08 Bill Schmidt <wschm...@linux.vnet.ibm.com> > Ira Rosen <i...@il.ibm.com> > > PR tree-optimization/50031 > * targhooks.c (default_builtin_vectorization_cost): Handle vec_cvt. > * target.h (enum vect_cost_for_stmt): Add vec_cvt. > * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle > all types of reduction and pattern statements. > (vect_estimate_min_profitable_iters): Likewise. > * tree-vect-stmts.c (vect_model_simple_cost): Use vec_cvt cost for > type promotions and demotions. > (vect_model_conversion_cost): New function. > (vect_get_load_cost): Use vec_perm for permutations; add dump logic > for explicit realigns. > (vectorizable_conversion): Call vect_model_conversion_cost. > * config/spu/spu.c (spu_builtin_vectorization_cost): Handle vec_cvt. > * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. > * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update > vec_perm for VSX and handle vec_cvt. 
> > > Index: gcc/targhooks.c > =================================================================== > --- gcc/targhooks.c (revision 183944) > +++ gcc/targhooks.c (working copy) > @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost > case scalar_to_vec: > case cond_branch_not_taken: > case vec_perm: > + case vec_cvt: > return 1; > > case unaligned_load: > Index: gcc/target.h > =================================================================== > --- gcc/target.h (revision 183944) > +++ gcc/target.h (working copy) > @@ -145,7 +145,8 @@ enum vect_cost_for_stmt > scalar_to_vec, > cond_branch_not_taken, > cond_branch_taken, > - vec_perm > + vec_perm, > + vec_cvt > };
Somewhere we should document what these mean. vec_cvt certainly is non-obvious, especially as it does not apply to int <-> float converts. Maybe vec_promote_demote would be a better name. > /* The target structure. This holds all the backend hooks. */ > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c (revision 183944) > +++ gcc/tree-vect-loop.c (working copy) > @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf > if (stmt_info > && !STMT_VINFO_RELEVANT_P (stmt_info) > && (!STMT_VINFO_LIVE_P (stmt_info) > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE > (stmt_info))) > + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) > continue; > > if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) > @@ -2562,17 +2563,51 @@ vect_estimate_min_profitable_iters (loop_vec_info > > for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) > { > - gimple stmt = gsi_stmt (si); > + gimple stmt = gsi_stmt (si), pattern_stmt; > stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > /* Skip stmts that are not vectorized inside the loop. */ > if (!STMT_VINFO_RELEVANT_P (stmt_info) > && (!STMT_VINFO_LIVE_P (stmt_info) > - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) > - continue; > + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE > (stmt_info)))) > + { > + if (STMT_VINFO_IN_PATTERN_P (stmt_info) > + && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) > + && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) > + || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) > + { > + stmt = pattern_stmt; > + stmt_info = vinfo_for_stmt (stmt); > + } > + else > + continue; > + } It looks super-ugly that way. I think we should _always_ use the main pattern stmt - do we at this point already have generated the vectorized loop? I suppose better not. 
> vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * > factor; > /* FIXME: for stmts in the inner-loop in outer-loop vectorization, > some of the "outside" costs are generated inside the outer-loop. > */ > vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); > + if (is_pattern_stmt_p (stmt_info) > + && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)) > + { > + gimple_stmt_iterator gsi; > + > + for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); > + !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple pattern_def_stmt = gsi_stmt (gsi); > + stmt_vec_info pattern_def_stmt_info > + = vinfo_for_stmt (pattern_def_stmt); > + if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) > + || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) > + { > + vec_inside_cost > + += STMT_VINFO_INSIDE_OF_LOOP_COST > + (pattern_def_stmt_info) * factor; > + vec_outside_cost > + += STMT_VINFO_OUTSIDE_OF_LOOP_COST > + (pattern_def_stmt_info); > + } > + } > + } > } > } > > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c (revision 183944) > +++ gcc/tree-vect-stmts.c (working copy) > @@ -787,13 +787,18 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i > { > int i; > int inside_cost = 0, outside_cost = 0; > + enum vect_cost_for_stmt cost = vector_stmt; > > /* The SLP costs were already calculated during SLP tree build. */ > if (PURE_SLP_STMT (stmt_info)) > return; > > - inside_cost = ncopies * vect_get_stmt_cost (vector_stmt); > + if (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type > + || STMT_VINFO_TYPE (stmt_info) == type_demotion_vec_info_type) > + cost = vec_cvt; That looks redundant with the vect_model_conversion_cost you introduce. > + inside_cost = ncopies * vect_get_stmt_cost (cost); > + > /* FORNOW: Assuming maximum 2 args per stmts. 
*/ > for (i = 0; i < 2; i++) > { > @@ -811,6 +816,43 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i > } > > > +/* Model cost for type demotion and promotion operations. */ > + > +static void > +vect_model_conversion_cost (stmt_vec_info stmt_info, > + enum vect_def_type *dt, int pwr) The PWR argument needs documenting. At least I couldn't figure out why we are adding cost proportional to pwr^2 here. > +{ > + int i, tmp; > + int inside_cost = 0, outside_cost = 0, single_stmt_cost; > + > + /* The SLP costs were already calculated during SLP tree build. */ > + if (PURE_SLP_STMT (stmt_info)) > + return; > + > + single_stmt_cost = vect_get_stmt_cost (vec_cvt); > + for (i = 0; i < pwr + 1; i++) > + { > + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? > + (i + 1) : i; > + inside_cost += vect_pow2 (tmp) * single_stmt_cost; > + } > + > + /* FORNOW: Assuming maximum 2 args per stmts. */ > + for (i = 0; i < 2; i++) > + { > + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) > + outside_cost += vect_get_stmt_cost (vector_stmt); > + } > + > + if (vect_print_dump_info (REPORT_COST)) > + fprintf (vect_dump, "vect_model_conversion_cost: inside_cost = %d, " > + "outside_cost = %d .", inside_cost, outside_cost); > + > + /* Set the costs in STMT_INFO. */ > + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); > + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); > +} > + > /* Function vect_cost_strided_group_size > > For strided load or store, return the group_size only if it is the first > @@ -887,7 +929,6 @@ vect_model_store_cost (stmt_vec_info stmt_info, in > if (vect_print_dump_info (REPORT_COST)) > fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d > .", > group_size); > - > } > > /* Costs of the stores. 
*/ > @@ -1049,7 +1090,7 @@ vect_get_load_cost (struct data_reference *dr, int > case dr_explicit_realign: > { > *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) > - + vect_get_stmt_cost (vector_stmt)); > + + vect_get_stmt_cost (vec_perm)); Looks unrelated? > > /* FIXME: If the misalignment remains fixed across the iterations of > the containing loop, the following cost should be added to the > @@ -1057,6 +1098,9 @@ vect_get_load_cost (struct data_reference *dr, int > if (targetm.vectorize.builtin_mask_for_load) > *inside_cost += vect_get_stmt_cost (vector_stmt); > > + if (vect_print_dump_info (REPORT_COST)) > + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); > + > break; > } > case dr_explicit_realign_optimized: > @@ -1080,7 +1124,7 @@ vect_get_load_cost (struct data_reference *dr, int > } > > *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) > - + vect_get_stmt_cost (vector_stmt)); > + + vect_get_stmt_cost (vec_perm)); For consistency a printf is missing here, too. Both changes seem to be unrelated to the fix. 
> break; > } > > @@ -2392,16 +2436,19 @@ vectorizable_conversion (gimple stmt, gimple_stmt_ > if (vect_print_dump_info (REPORT_DETAILS)) > fprintf (vect_dump, "=== vectorizable_conversion ==="); > if (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR) > - STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; > + { > + STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; > + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > + } > else if (modifier == NARROW) > { > STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; > - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); > + vect_model_conversion_cost (stmt_info, dt, multi_step_cvt); > } > else > { > STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; > - vect_model_simple_cost (stmt_info, 2 * ncopies, dt, NULL); > + vect_model_conversion_cost (stmt_info, dt, multi_step_cvt); > } > VEC_free (tree, heap, interm_types); > return true; > Index: gcc/config/spu/spu.c > =================================================================== > --- gcc/config/spu/spu.c (revision 183944) > +++ gcc/config/spu/spu.c (working copy) > @@ -6920,6 +6920,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for > case scalar_to_vec: > case cond_branch_not_taken: > case vec_perm: > + case vec_cvt: > return 1; > > case scalar_store: > Index: gcc/config/i386/i386.c > =================================================================== > --- gcc/config/i386/i386.c (revision 183944) > +++ gcc/config/i386/i386.c (working copy) > @@ -35336,6 +35336,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo > return ix86_cost->cond_not_taken_branch_cost; > > case vec_perm: > + case vec_cvt: > return ix86_cost->vec_stmt_cost; > > default: > Index: gcc/config/rs6000/rs6000.c > =================================================================== > --- gcc/config/rs6000/rs6000.c (revision 183944) > +++ gcc/config/rs6000/rs6000.c (working copy) > @@ -3543,10 +3543,17 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ > 
return 1; > > case vec_perm: > - if (!TARGET_VSX) > + if (TARGET_VSX) > + return 4; > + else > return 1; > - return 2; > > + case vec_cvt: > + if (TARGET_VSX) > + return 5; > + else > + return 1; > + > case cond_branch_taken: > return 3; > > >