Hi Richard,

on 2023/8/14 20:20, Richard Sandiford wrote:
> Thanks for the clean-ups.  But...
>
> "Kewen.Lin" <li...@linux.ibm.com> writes:
>> Hi,
>>
>> Following Richi's suggestion [1], this patch moves the
>> handling of VMAT_GATHER_SCATTER in the final loop nest
>> of function vectorizable_load into its own loop.  Basically
>> it duplicates the final loop nest, cleans up some useless
>> set-up code for the case of VMAT_GATHER_SCATTER, and removes
>> some unreachable code.  It also removes the corresponding
>> handling from the final loop nest.
>>
>> Bootstrapped and regtested on x86_64-redhat-linux,
>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>
>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>
>> Is it ok for trunk?
>>
>> BR,
>> Kewen
>> -----
>>
>> gcc/ChangeLog:
>>
>>         * tree-vect-stmts.cc (vectorizable_load): Move the handling of
>>         VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>>         and update the final nest accordingly.
>> ---
>>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
>>  1 file changed, 219 insertions(+), 142 deletions(-)
>
> ...that seems like quite a lot of +s.  Is there nothing we can do to
> avoid the cut-&-paste?

Thanks for the comments!  I'm not sure I understand the question: if we
want to move the VMAT_GATHER_SCATTER handling out into its own loop, the
new +s seem inevitable, don't they?  Or is your concern mainly about the
git blame history?
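To make the shape of the change concrete, here is a minimal sketch of the
before/after control flow (purely illustrative: access_kind, emit_gather
and emit_contiguous are made-up stand-ins for the real vectorizer
machinery, not GCC code):

  enum access_kind { CONTIGUOUS, GATHER_SCATTER };

  static void emit_gather (int) {}      /* stand-in: gather/scatter path */
  static void emit_contiguous (int) {}  /* stand-in: contiguous path */

  /* Before: one shared loop nest, dispatching on the access type in
     every iteration.  */
  static bool
  load_before (access_kind kind, int ncopies)
  {
    for (int j = 0; j < ncopies; j++)
      if (kind == GATHER_SCATTER)
        emit_gather (j);
      else
        emit_contiguous (j);
    return true;
  }

  /* After: the gather/scatter case gets a dedicated loop and returns
     early, so the remaining nest no longer has to check for it.  */
  static bool
  load_after (access_kind kind, int ncopies)
  {
    if (kind == GATHER_SCATTER)
      {
        for (int j = 0; j < ncopies; j++)
          emit_gather (j);
        return true;
      }
    for (int j = 0; j < ncopies; j++)
      emit_contiguous (j);
    return true;
  }

The +s you point at are the price of the duplicated loop body; the
benefit is that each loop then carries only the set-up its own case needs.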
BR,
Kewen

>
> Richard
>
>>
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index c361e16cb7b..5e514eca19b 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -10455,6 +10455,218 @@ vectorizable_load (vec_info *vinfo,
>>        return true;
>>      }
>>
>> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>> +    {
>> +      gcc_assert (alignment_support_scheme == dr_aligned
>> +                  || alignment_support_scheme == dr_unaligned_supported);
>> +      gcc_assert (!grouped_load && !slp_perm);
>> +
>> +      unsigned int inside_cost = 0, prologue_cost = 0;
>> +      for (j = 0; j < ncopies; j++)
>> +        {
>> +          /* 1. Create the vector or array pointer update chain.  */
>> +          if (j == 0 && !costing_p)
>> +            {
>> +              if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +                vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>> +                                             slp_node, &gs_info, &dataref_ptr,
>> +                                             &vec_offsets);
>> +              else
>> +                dataref_ptr
>> +                  = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> +                                              at_loop, offset, &dummy, gsi,
>> +                                              &ptr_incr, false, bump);
>> +            }
>> +          else if (!costing_p)
>> +            {
>> +              gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>> +              if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +                dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +                                               gsi, stmt_info, bump);
>> +            }
>> +
>> +          if (mask && !costing_p)
>> +            vec_mask = vec_masks[j];
>> +
>> +          gimple *new_stmt = NULL;
>> +          for (i = 0; i < vec_num; i++)
>> +            {
>> +              tree final_mask = NULL_TREE;
>> +              tree final_len = NULL_TREE;
>> +              tree bias = NULL_TREE;
>> +              if (!costing_p)
>> +                {
>> +                  if (loop_masks)
>> +                    final_mask
>> +                      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> +                                            vec_num * ncopies, vectype,
>> +                                            vec_num * j + i);
>> +                  if (vec_mask)
>> +                    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>> +                                                   final_mask, vec_mask, gsi);
>> +
>> +                  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +                    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +                                                   gsi, stmt_info, bump);
>> +                }
>> +
>> +              /* 2. Create the vector-load in the loop.  */
>> +              unsigned HOST_WIDE_INT align;
>> +              if (gs_info.ifn != IFN_LAST)
>> +                {
>> +                  if (costing_p)
>> +                    {
>> +                      unsigned int cnunits = vect_nunits_for_cost (vectype);
>> +                      inside_cost
>> +                        = record_stmt_cost (cost_vec, cnunits, scalar_load,
>> +                                            stmt_info, 0, vect_body);
>> +                      continue;
>> +                    }
>> +                  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +                    vec_offset = vec_offsets[vec_num * j + i];
>> +                  tree zero = build_zero_cst (vectype);
>> +                  tree scale = size_int (gs_info.scale);
>> +
>> +                  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
>> +                    {
>> +                      if (loop_lens)
>> +                        final_len
>> +                          = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>> +                                               vec_num * ncopies, vectype,
>> +                                               vec_num * j + i, 1);
>> +                      else
>> +                        final_len
>> +                          = build_int_cst (sizetype,
>> +                                           TYPE_VECTOR_SUBPARTS (vectype));
>> +                      signed char biasval
>> +                        = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>> +                      bias = build_int_cst (intQI_type_node, biasval);
>> +                      if (!final_mask)
>> +                        {
>> +                          mask_vectype = truth_type_for (vectype);
>> +                          final_mask = build_minus_one_cst (mask_vectype);
>> +                        }
>> +                    }
>> +
>> +                  gcall *call;
>> +                  if (final_len && final_mask)
>> +                    call
>> +                      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
>> +                                                    dataref_ptr, vec_offset,
>> +                                                    scale, zero, final_mask,
>> +                                                    final_len, bias);
>> +                  else if (final_mask)
>> +                    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
>> +                                                       dataref_ptr, vec_offset,
>> +                                                       scale, zero, final_mask);
>> +                  else
>> +                    call = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
>> +                                                       dataref_ptr, vec_offset,
>> +                                                       scale, zero);
>> +                  gimple_call_set_nothrow (call, true);
>> +                  new_stmt = call;
>> +                  data_ref = NULL_TREE;
>> +                }
>> +              else
>> +                {
>> +                  /* Emulated gather-scatter.  */
>> +                  gcc_assert (!final_mask);
>> +                  unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
>> +                  if (costing_p)
>> +                    {
>> +                      /* For emulated gathers N offset vector element
>> +                         extracts (we assume the scalar scaling and ptr
>> +                         + offset add is consumed by the load).  */
>> +                      inside_cost = record_stmt_cost (cost_vec, const_nunits,
>> +                                                      vec_to_scalar, stmt_info,
>> +                                                      0, vect_body);
>> +                      /* N scalar loads plus gathering them into a
>> +                         vector.  */
>> +                      inside_cost
>> +                        = record_stmt_cost (cost_vec, const_nunits, scalar_load,
>> +                                            stmt_info, 0, vect_body);
>> +                      inside_cost
>> +                        = record_stmt_cost (cost_vec, 1, vec_construct,
>> +                                            stmt_info, 0, vect_body);
>> +                      continue;
>> +                    }
>> +                  unsigned HOST_WIDE_INT const_offset_nunits
>> +                    = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
>> +                        .to_constant ();
>> +                  vec<constructor_elt, va_gc> *ctor_elts;
>> +                  vec_alloc (ctor_elts, const_nunits);
>> +                  gimple_seq stmts = NULL;
>> +                  /* We support offset vectors with more elements
>> +                     than the data vector for now.  */
>> +                  unsigned HOST_WIDE_INT factor
>> +                    = const_offset_nunits / const_nunits;
>> +                  vec_offset = vec_offsets[j / factor];
>> +                  unsigned elt_offset = (j % factor) * const_nunits;
>> +                  tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
>> +                  tree scale = size_int (gs_info.scale);
>> +                  align = get_object_alignment (DR_REF (first_dr_info->dr));
>> +                  tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
>> +                  for (unsigned k = 0; k < const_nunits; ++k)
>> +                    {
>> +                      tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
>> +                                              bitsize_int (k + elt_offset));
>> +                      tree idx
>> +                        = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
>> +                                        vec_offset, TYPE_SIZE (idx_type), boff);
>> +                      idx = gimple_convert (&stmts, sizetype, idx);
>> +                      idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
>> +                                          scale);
>> +                      tree ptr = gimple_build (&stmts, PLUS_EXPR,
>> +                                               TREE_TYPE (dataref_ptr),
>> +                                               dataref_ptr, idx);
>> +                      ptr = gimple_convert (&stmts, ptr_type_node, ptr);
>> +                      tree elt = make_ssa_name (TREE_TYPE (vectype));
>> +                      tree ref = build2 (MEM_REF, ltype, ptr,
>> +                                         build_int_cst (ref_type, 0));
>> +                      new_stmt = gimple_build_assign (elt, ref);
>> +                      gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
>> +                      gimple_seq_add_stmt (&stmts, new_stmt);
>> +                      CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
>> +                    }
>> +                  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>> +                  new_stmt = gimple_build_assign (
>> +                    NULL_TREE, build_constructor (vectype, ctor_elts));
>> +                  data_ref = NULL_TREE;
>> +                }
>> +
>> +              vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> +              /* DATA_REF is null if we've already built the statement.  */
>> +              if (data_ref)
>> +                {
>> +                  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>> +                  new_stmt = gimple_build_assign (vec_dest, data_ref);
>> +                }
>> +              new_temp = make_ssa_name (vec_dest, new_stmt);
>> +              gimple_set_lhs (new_stmt, new_temp);
>> +              vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +
>> +              /* Store vector loads in the corresponding SLP_NODE.  */
>> +              if (slp)
>> +                slp_node->push_vec_def (new_stmt);
>> +            }
>> +
>> +          if (!slp && !costing_p)
>> +            STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +        }
>> +
>> +      if (!slp && !costing_p)
>> +        *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>> +
>> +      if (costing_p)
>> +        {
>> +          if (dump_enabled_p ())
>> +            dump_printf_loc (MSG_NOTE, vect_location,
>> +                             "vect_model_load_cost: inside_cost = %u, "
>> +                             "prologue_cost = %u .\n",
>> +                             inside_cost, prologue_cost);
>> +        }
>> +      return true;
>> +    }
>> +
>>    poly_uint64 group_elt = 0;
>>    unsigned int inside_cost = 0, prologue_cost = 0;
>>    for (j = 0; j < ncopies; j++)
>> @@ -10504,12 +10716,6 @@ vectorizable_load (vec_info *vinfo,
>>                gcc_assert (!compute_in_loop);
>>              }
>>          }
>> -      else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> -        {
>> -          vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>> -                                       slp_node, &gs_info, &dataref_ptr,
>> -                                       &vec_offsets);
>> -        }
>>        else
>>          dataref_ptr
>>            = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> @@ -10525,7 +10731,7 @@ vectorizable_load (vec_info *vinfo,
>>            if (dataref_offset)
>>              dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
>>                                                bump);
>> -          else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +          else
>>              dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>>                                             stmt_info, bump);
>>            if (mask)
>> @@ -10551,7 +10757,7 @@ vectorizable_load (vec_info *vinfo,
>>              final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>>                                             final_mask, vec_mask, gsi);
>>
>> -          if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +          if (i > 0)
>>              dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>>                                             gsi, stmt_info, bump);
>>          }
>> @@ -10562,139 +10768,11 @@ vectorizable_load (vec_info *vinfo,
>>          case dr_aligned:
>>          case dr_unaligned_supported:
>>            {
>> -            unsigned int misalign;
>> -            unsigned HOST_WIDE_INT align;
>> -
>> -            if (memory_access_type == VMAT_GATHER_SCATTER
>> -                && gs_info.ifn != IFN_LAST)
>> -              {
>> -                if (costing_p)
>> -                  {
>> -                    unsigned int cnunits = vect_nunits_for_cost (vectype);
>> -                    inside_cost
>> -                      = record_stmt_cost (cost_vec, cnunits, scalar_load,
>> -                                          stmt_info, 0, vect_body);
>> -                    break;
>> -                  }
>> -                if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> -                  vec_offset = vec_offsets[vec_num * j + i];
>> -                tree zero = build_zero_cst (vectype);
>> -                tree scale = size_int (gs_info.scale);
>> -
>> -                if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
>> -                  {
>> -                    if (loop_lens)
>> -                      final_len
>> -                        = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>> -                                             vec_num * ncopies, vectype,
>> -                                             vec_num * j + i, 1);
>> -                    else
>> -                      final_len
>> -                        = build_int_cst (sizetype,
>> -                                         TYPE_VECTOR_SUBPARTS (vectype));
>> -                    signed char biasval
>> -                      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>> -                    bias = build_int_cst (intQI_type_node, biasval);
>> -                    if (!final_mask)
>> -                      {
>> -                        mask_vectype = truth_type_for (vectype);
>> -                        final_mask = build_minus_one_cst (mask_vectype);
>> -                      }
>> -                  }
>> -
>> -                gcall *call;
>> -                if (final_len && final_mask)
>> -                  call = gimple_build_call_internal (
>> -                    IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
>> -                    scale, zero, final_mask, final_len, bias);
>> -                else if (final_mask)
>> -                  call
>> -                    = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
>> -                                                  dataref_ptr, vec_offset,
>> -                                                  scale, zero, final_mask);
>> -                else
>> -                  call
>> -                    = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
>> -                                                  dataref_ptr, vec_offset,
>> -                                                  scale, zero);
>> -                gimple_call_set_nothrow (call, true);
>> -                new_stmt = call;
>> -                data_ref = NULL_TREE;
>> -                break;
>> -              }
>> -            else if (memory_access_type == VMAT_GATHER_SCATTER)
>> -              {
>> -                /* Emulated gather-scatter.  */
>> -                gcc_assert (!final_mask);
>> -                unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
>> -                if (costing_p)
>> -                  {
>> -                    /* For emulated gathers N offset vector element
>> -                       extracts (we assume the scalar scaling and ptr
>> -                       + offset add is consumed by the load).  */
>> -                    inside_cost
>> -                      = record_stmt_cost (cost_vec, const_nunits,
>> -                                          vec_to_scalar, stmt_info, 0,
>> -                                          vect_body);
>> -                    /* N scalar loads plus gathering them into a
>> -                       vector.  */
>> -                    inside_cost = record_stmt_cost (cost_vec, const_nunits,
>> -                                                    scalar_load, stmt_info,
>> -                                                    0, vect_body);
>> -                    inside_cost
>> -                      = record_stmt_cost (cost_vec, 1, vec_construct,
>> -                                          stmt_info, 0, vect_body);
>> -                    break;
>> -                  }
>> -                unsigned HOST_WIDE_INT const_offset_nunits
>> -                  = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
>> -                      .to_constant ();
>> -                vec<constructor_elt, va_gc> *ctor_elts;
>> -                vec_alloc (ctor_elts, const_nunits);
>> -                gimple_seq stmts = NULL;
>> -                /* We support offset vectors with more elements
>> -                   than the data vector for now.  */
>> -                unsigned HOST_WIDE_INT factor
>> -                  = const_offset_nunits / const_nunits;
>> -                vec_offset = vec_offsets[j / factor];
>> -                unsigned elt_offset = (j % factor) * const_nunits;
>> -                tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
>> -                tree scale = size_int (gs_info.scale);
>> -                align = get_object_alignment (DR_REF (first_dr_info->dr));
>> -                tree ltype
>> -                  = build_aligned_type (TREE_TYPE (vectype), align);
>> -                for (unsigned k = 0; k < const_nunits; ++k)
>> -                  {
>> -                    tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
>> -                                            bitsize_int (k + elt_offset));
>> -                    tree idx = gimple_build (&stmts, BIT_FIELD_REF,
>> -                                             idx_type, vec_offset,
>> -                                             TYPE_SIZE (idx_type), boff);
>> -                    idx = gimple_convert (&stmts, sizetype, idx);
>> -                    idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
>> -                                        scale);
>> -                    tree ptr = gimple_build (&stmts, PLUS_EXPR,
>> -                                             TREE_TYPE (dataref_ptr),
>> -                                             dataref_ptr, idx);
>> -                    ptr = gimple_convert (&stmts, ptr_type_node, ptr);
>> -                    tree elt = make_ssa_name (TREE_TYPE (vectype));
>> -                    tree ref = build2 (MEM_REF, ltype, ptr,
>> -                                       build_int_cst (ref_type, 0));
>> -                    new_stmt = gimple_build_assign (elt, ref);
>> -                    gimple_set_vuse (new_stmt,
>> -                                     gimple_vuse (gsi_stmt (*gsi)));
>> -                    gimple_seq_add_stmt (&stmts, new_stmt);
>> -                    CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
>> -                  }
>> -                gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>> -                new_stmt = gimple_build_assign (
>> -                  NULL_TREE, build_constructor (vectype, ctor_elts));
>> -                data_ref = NULL_TREE;
>> -                break;
>> -              }
>> -
>>              if (costing_p)
>>                break;
>>
>> +            unsigned int misalign;
>> +            unsigned HOST_WIDE_INT align;
>>              align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
>>              if (alignment_support_scheme == dr_aligned)
>>                misalign = 0;
>> @@ -11156,10 +11234,9 @@ vectorizable_load (vec_info *vinfo,
>>
>>    if (costing_p)
>>      {
>> -      gcc_assert (memory_access_type != VMAT_INVARIANT
>> -                  && memory_access_type != VMAT_ELEMENTWISE
>> -                  && memory_access_type != VMAT_STRIDED_SLP
>> -                  && memory_access_type != VMAT_LOAD_STORE_LANES);
>> +      gcc_assert (memory_access_type == VMAT_CONTIGUOUS
>> +                  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
>> +                  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>>        if (dump_enabled_p ())
>>          dump_printf_loc (MSG_NOTE, vect_location,
>>                           "vect_model_load_cost: inside_cost = %u, "
>> --
>> 2.39.1
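
A closing note for anyone following the emulated path quoted above (the
gs_info.ifn == IFN_LAST branch): stripped of the GIMPLE-building
machinery, the code it generates amounts to roughly the following
standalone sketch (made-up types and parameters, purely illustrative,
not the vectorizer's actual output):

  #include <cstddef>
  #include <cstdint>

  /* Gather NUNITS lanes: for each lane, extract its offset, scale it,
     add it to the base pointer, do one scalar load, and insert the
     result into the destination "vector" (a plain array here).  */
  template <typename T, std::size_t NUNITS>
  void
  emulated_gather (T *dst, const char *base,
                   const std::uint32_t *offsets, int scale)
  {
    for (std::size_t k = 0; k < NUNITS; ++k)
      {
        const char *ptr = base + (std::size_t) offsets[k] * scale;
        dst[k] = *(const T *) ptr;  /* one scalar load per lane */
      }
  }

For instance, emulated_gather<float, 4> (out, (const char *) table, idx,
sizeof (float)) loads table[idx[0]]..table[idx[3]] into out, which is why
the costing above charges const_nunits vec_to_scalar extracts,
const_nunits scalar loads, and one vec_construct.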