This patch adjusts the cost handling on VMAT_GATHER_SCATTER in function vectorizable_load; we no longer call function vect_model_load_cost for it.
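For reference, below is a minimal standalone sketch of the per-copy stmt counts the new inline costing records for gather loads.  It is illustrative only: the gather_cost struct and gather_load_cost function are made up here and are not GCC interfaces; the counts mirror what vect_model_load_cost used to record for this access type.

  /* Illustrative only: mirrors the counts recorded via record_stmt_cost
     in this patch, not the actual GCC cost hooks.  */
  #include <stdio.h>

  struct gather_cost { unsigned vec_to_scalar, scalar_load, vec_construct; };

  /* NUNITS is the number of vector elements per copy; IFN_GATHER is
     nonzero when a gather internal function is available.  */
  static struct gather_cost
  gather_load_cost (unsigned nunits, int ifn_gather)
  {
    struct gather_cost c = { 0, 0, 0 };
    if (ifn_gather)
      /* IFN gather: N scalar loads.  */
      c.scalar_load = nunits;
    else
      {
        /* Emulated gather: N offset element extracts, N scalar loads,
           plus one vec_construct to build the result vector.  */
        c.vec_to_scalar = nunits;
        c.scalar_load = nunits;
        c.vec_construct = 1;
      }
    return c;
  }

  int
  main (void)
  {
    struct gather_cost c = gather_load_cost (4, 0);
    printf ("vec_to_scalar=%u scalar_load=%u vec_construct=%u\n",
            c.vec_to_scalar, c.scalar_load, c.vec_construct);
    return 0;
  }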
The new costing mainly covers gather loads with an IFN and emulated gather loads, and it follows the existing handling in function vect_model_load_cost.  This patch shouldn't cause any functional changes.

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_load): Adjust the cost handling on
	VMAT_GATHER_SCATTER without calling vect_model_load_cost.
	(vect_model_load_cost): Adjust the assertion on VMAT_GATHER_SCATTER,
	remove VMAT_GATHER_SCATTER related handling and the related parameter
	gs_info.
---
 gcc/tree-vect-stmts.cc | 123 +++++++++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 48 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 651dc800380..a3fd0bf879e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1131,11 +1131,10 @@ vect_model_load_cost (vec_info *vinfo,
                        vect_memory_access_type memory_access_type,
                        dr_alignment_support alignment_support_scheme,
                        int misalignment,
-                       gather_scatter_info *gs_info,
                        slp_tree slp_node,
                        stmt_vector_for_cost *cost_vec)
 {
-  gcc_assert ((memory_access_type != VMAT_GATHER_SCATTER || !gs_info->decl)
+  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
               && memory_access_type != VMAT_INVARIANT
               && memory_access_type != VMAT_ELEMENTWISE
               && memory_access_type != VMAT_STRIDED_SLP);
@@ -1222,35 +1221,9 @@ vect_model_load_cost (vec_info *vinfo,
                              group_size);
     }
 
-  /* The loads themselves.  */
-  if (memory_access_type == VMAT_GATHER_SCATTER)
-    {
-      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-      unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
-      if (memory_access_type == VMAT_GATHER_SCATTER
-          && gs_info->ifn == IFN_LAST && !gs_info->decl)
-        /* For emulated gathers N offset vector element extracts
-           (we assume the scalar scaling and ptr + offset add is consumed by
-           the load).  */
-        inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits,
-                                         vec_to_scalar, stmt_info, 0,
-                                         vect_body);
-      /* N scalar loads plus gathering them into a vector.  */
-      inside_cost += record_stmt_cost (cost_vec,
-                                       ncopies * assumed_nunits,
-                                       scalar_load, stmt_info, 0, vect_body);
-    }
-  else
-    vect_get_load_cost (vinfo, stmt_info, ncopies,
-                        alignment_support_scheme, misalignment, first_stmt_p,
-                        &inside_cost, &prologue_cost,
-                        cost_vec, cost_vec, true);
-
-  if (memory_access_type == VMAT_GATHER_SCATTER
-      && gs_info->ifn == IFN_LAST
-      && !gs_info->decl)
-    inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
-                                     stmt_info, 0, vect_body);
+  vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
+                      misalignment, first_stmt_p, &inside_cost, &prologue_cost,
+                      cost_vec, cost_vec, true);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -10137,6 +10110,7 @@ vectorizable_load (vec_info *vinfo,
     }
   tree vec_mask = NULL_TREE;
   poly_uint64 group_elt = 0;
+  unsigned int inside_cost = 0;
   for (j = 0; j < ncopies; j++)
     {
       /* 1. Create the vector or array pointer update chain.  */
@@ -10268,23 +10242,25 @@ vectorizable_load (vec_info *vinfo,
           /* Record that VEC_ARRAY is now dead.  */
           vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
         }
-      else if (!costing_p)
+      else
         {
           for (i = 0; i < vec_num; i++)
             {
               tree final_mask = NULL_TREE;
-              if (loop_masks
-                  && memory_access_type != VMAT_INVARIANT)
-                final_mask = vect_get_loop_mask (gsi, loop_masks,
-                                                 vec_num * ncopies,
-                                                 vectype, vec_num * j + i);
-              if (vec_mask)
-                final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
-                                               final_mask, vec_mask, gsi);
-
-              if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-                dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
-                                               gsi, stmt_info, bump);
+              if (!costing_p)
+                {
+                  if (loop_masks && memory_access_type != VMAT_INVARIANT)
+                    final_mask
+                      = vect_get_loop_mask (gsi, loop_masks, vec_num * ncopies,
+                                            vectype, vec_num * j + i);
+                  if (vec_mask)
+                    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+                                                   final_mask, vec_mask, gsi);
+
+                  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+                    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+                                                   gsi, stmt_info, bump);
+                }
 
               /* 2. Create the vector-load in the loop.  */
               switch (alignment_support_scheme)
@@ -10298,6 +10274,16 @@ vectorizable_load (vec_info *vinfo,
                     if (memory_access_type == VMAT_GATHER_SCATTER
                         && gs_info.ifn != IFN_LAST)
                       {
+                        if (costing_p)
+                          {
+                            unsigned int cnunits
+                              = vect_nunits_for_cost (vectype);
+                            inside_cost
+                              = record_stmt_cost (cost_vec, cnunits,
+                                                  scalar_load, stmt_info, 0,
+                                                  vect_body);
+                            goto vec_num_loop_costing_end;
+                          }
                         if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
                           vec_offset = vec_offsets[vec_num * j + i];
                         tree zero = build_zero_cst (vectype);
@@ -10322,6 +10308,25 @@ vectorizable_load (vec_info *vinfo,
                         gcc_assert (!final_mask);
                         unsigned HOST_WIDE_INT const_nunits
                           = nunits.to_constant ();
+                        if (costing_p)
+                          {
+                            /* For emulated gathers N offset vector element
+                               offset add is consumed by the load).  */
+                            inside_cost
+                              = record_stmt_cost (cost_vec, const_nunits,
+                                                  vec_to_scalar, stmt_info, 0,
+                                                  vect_body);
+                            /* N scalar loads plus gathering them into a
+                               vector.  */
+                            inside_cost
+                              = record_stmt_cost (cost_vec, const_nunits,
+                                                  scalar_load, stmt_info, 0,
+                                                  vect_body);
+                            inside_cost
+                              = record_stmt_cost (cost_vec, 1, vec_construct,
+                                                  stmt_info, 0, vect_body);
+                            goto vec_num_loop_costing_end;
+                          }
                         unsigned HOST_WIDE_INT const_offset_nunits
                           = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
                               .to_constant ();
@@ -10374,6 +10379,9 @@ vectorizable_load (vec_info *vinfo,
                         break;
                       }
 
+                    if (costing_p)
+                      goto vec_num_loop_costing_end;
+
                     align =
                       known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
                     if (alignment_support_scheme == dr_aligned)
@@ -10544,6 +10552,8 @@ vectorizable_load (vec_info *vinfo,
                   }
                 case dr_explicit_realign:
                   {
+                    if (costing_p)
+                      goto vec_num_loop_costing_end;
                     tree ptr, bump;
 
                     tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
@@ -10606,6 +10616,8 @@ vectorizable_load (vec_info *vinfo,
                   }
                 case dr_explicit_realign_optimized:
                   {
+                    if (costing_p)
+                      goto vec_num_loop_costing_end;
                     if (TREE_CODE (dataref_ptr) == SSA_NAME)
                       new_temp = copy_ssa_name (dataref_ptr);
                     else
@@ -10702,10 +10714,14 @@ vectorizable_load (vec_info *vinfo,
                                                  gsi, stmt_info, bump);
                   group_elt = 0;
                 }
+vec_num_loop_costing_end:
+              ;
             }
           /* Bump the vector pointer to account for a gap or for excess
              elements loaded for a permuted SLP load.  */
-          if (maybe_ne (group_gap_adj, 0U) && slp_perm)
+          if (!costing_p
+              && maybe_ne (group_gap_adj, 0U)
+              && slp_perm)
             {
               poly_wide_int bump_val
                 = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
@@ -10754,9 +10770,20 @@ vectorizable_load (vec_info *vinfo,
     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 
   if (costing_p)
-    vect_model_load_cost (vinfo, stmt_info, ncopies, vf, memory_access_type,
-                          alignment_support_scheme, misalignment, &gs_info,
-                          slp_node, cost_vec);
+    {
+      if (memory_access_type == VMAT_GATHER_SCATTER)
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_NOTE, vect_location,
+                             "vect_model_load_cost: inside_cost = %u, "
+                             "prologue_cost = 0 .\n",
+                             inside_cost);
+        }
+      else
+        vect_model_load_cost (vinfo, stmt_info, ncopies, vf, memory_access_type,
+                              alignment_support_scheme, misalignment, slp_node,
+                              cost_vec);
+    }
 
   return true;
 }
-- 
2.31.1