On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin <li...@linux.ibm.com> wrote: > > Hi, > > Following Richi's suggestion [1], this patch moves the > handling of VMAT_LOAD_STORE_LANES in the final loop nest > of function vectorizable_load to its own loop. Basically > it duplicates the final loop nest, cleans up some useless > setup code for the VMAT_LOAD_STORE_LANES case, and removes > some unreachable code. It also removes the corresponding > handling from the final loop nest. > > Bootstrapped and regtested on x86_64-redhat-linux, > aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
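For readers skimming the patch, the essential shape of the change can be hard to see under the re-indentation: the VMAT_LOAD_STORE_LANES case is hoisted out of the shared final loop nest into a dedicated loop that returns early, leaving the remaining nest without any load-lanes branches. Below is only a minimal, self-contained C++ sketch of that control-flow pattern; the names (transform_load, transform_special, transform_general, access_kind) are invented for illustration and are not the GCC internals used in the actual patch.

#include <cstdio>

enum access_kind { SPECIAL_CASE, GENERAL_CASE };

/* Stand-in for the dedicated VMAT_LOAD_STORE_LANES codegen.  */
static void
transform_special (int j)
{
  std::printf ("load-lanes copy %d\n", j);
}

/* Stand-in for the general per-vector codegen.  */
static void
transform_general (int j, int i)
{
  std::printf ("copy %d, vector %d\n", j, i);
}

static bool
transform_load (access_kind kind, int ncopies, int vec_num)
{
  /* The special case gets its own loop and returns early, so the
     general nest below no longer needs an if/else on the access kind.  */
  if (kind == SPECIAL_CASE)
    {
      for (int j = 0; j < ncopies; j++)
	transform_special (j);
      return true;
    }

  /* General final loop nest, now free of special-case branches.  */
  for (int j = 0; j < ncopies; j++)
    for (int i = 0; i < vec_num; i++)
      transform_general (j, i);
  return true;
}

int
main ()
{
  transform_load (SPECIAL_CASE, 2, 4);
  transform_load (GENERAL_CASE, 2, 4);
  return 0;
}

In the real patch the early-returning branch corresponds to the new "if (memory_access_type == VMAT_LOAD_STORE_LANES)" block in vectorizable_load, which builds the IFN_LOAD_LANES / IFN_MASK_LOAD_LANES calls per copy and returns true before the general loop nest is reached.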
OK (I guess the big diff is mostly because of re-indenting). Thanks, Richard. > [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html > > gcc/ChangeLog: > > * tree-vect-stmts.cc (vectorizable_load): Move the handlings on > VMAT_LOAD_STORE_LANES in the final loop nest to its own loop, > and update the final nest accordingly. > --- > gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++-------------------- > 1 file changed, 634 insertions(+), 641 deletions(-) > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 4f2d088484c..c361e16cb7b 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo, > vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask, > &vec_masks, mask_vectype); > } > + > tree vec_mask = NULL_TREE; > + if (memory_access_type == VMAT_LOAD_STORE_LANES) > + { > + gcc_assert (alignment_support_scheme == dr_aligned > + || alignment_support_scheme == dr_unaligned_supported); > + gcc_assert (grouped_load && !slp); > + > + unsigned int inside_cost = 0, prologue_cost = 0; > + for (j = 0; j < ncopies; j++) > + { > + if (costing_p) > + { > + /* An IFN_LOAD_LANES will load all its vector results, > + regardless of which ones we actually need. Account > + for the cost of unused results. */ > + if (first_stmt_info == stmt_info) > + { > + unsigned int gaps = DR_GROUP_SIZE (first_stmt_info); > + stmt_vec_info next_stmt_info = first_stmt_info; > + do > + { > + gaps -= 1; > + next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); > + } > + while (next_stmt_info); > + if (gaps) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "vect_model_load_cost: %d " > + "unused vectors.\n", > + gaps); > + vect_get_load_cost (vinfo, stmt_info, gaps, > + alignment_support_scheme, > + misalignment, false, &inside_cost, > + &prologue_cost, cost_vec, cost_vec, > + true); > + } > + } > + vect_get_load_cost (vinfo, stmt_info, 1, > alignment_support_scheme, > + misalignment, false, &inside_cost, > + &prologue_cost, cost_vec, cost_vec, true); > + continue; > + } > + > + /* 1. Create the vector or array pointer update chain. */ > + if (j == 0) > + dataref_ptr > + = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type, > + at_loop, offset, &dummy, gsi, > + &ptr_incr, false, bump); > + else > + { > + gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)); > + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, > gsi, > + stmt_info, bump); > + } > + if (mask) > + vec_mask = vec_masks[j]; > + > + tree vec_array = create_vector_array (vectype, vec_num); > + > + tree final_mask = NULL_TREE; > + if (loop_masks) > + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > + ncopies, vectype, j); > + if (vec_mask) > + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, > final_mask, > + vec_mask, gsi); > + > + gcall *call; > + if (final_mask) > + { > + /* Emit: > + VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR, > + VEC_MASK). */ > + unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype)); > + tree alias_ptr = build_int_cst (ref_type, align); > + call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, > + dataref_ptr, alias_ptr, > + final_mask); > + } > + else > + { > + /* Emit: > + VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). 
*/ > + data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type); > + call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref); > + } > + gimple_call_set_lhs (call, vec_array); > + gimple_call_set_nothrow (call, true); > + vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); > + > + dr_chain.create (vec_num); > + /* Extract each vector into an SSA_NAME. */ > + for (i = 0; i < vec_num; i++) > + { > + new_temp = read_vector_array (vinfo, stmt_info, gsi, > scalar_dest, > + vec_array, i); > + dr_chain.quick_push (new_temp); > + } > + > + /* Record the mapping between SSA_NAMEs and statements. */ > + vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); > + > + /* Record that VEC_ARRAY is now dead. */ > + vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); > + > + dr_chain.release (); > + > + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; > + } > + > + if (costing_p && dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "vect_model_load_cost: inside_cost = %u, " > + "prologue_cost = %u .\n", > + inside_cost, prologue_cost); > + > + return true; > + } > + > poly_uint64 group_elt = 0; > unsigned int inside_cost = 0, prologue_cost = 0; > for (j = 0; j < ncopies; j++) > @@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo, > dr_chain.create (vec_num); > > gimple *new_stmt = NULL; > - if (memory_access_type == VMAT_LOAD_STORE_LANES) > + for (i = 0; i < vec_num; i++) > { > - if (costing_p) > - { > - /* An IFN_LOAD_LANES will load all its vector results, > - regardless of which ones we actually need. Account > - for the cost of unused results. */ > - if (grouped_load && first_stmt_info == stmt_info) > - { > - unsigned int gaps = DR_GROUP_SIZE (first_stmt_info); > - stmt_vec_info next_stmt_info = first_stmt_info; > - do > - { > - gaps -= 1; > - next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); > - } > - while (next_stmt_info); > - if (gaps) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_NOTE, vect_location, > - "vect_model_load_cost: %d " > - "unused vectors.\n", > - gaps); > - vect_get_load_cost (vinfo, stmt_info, gaps, > - alignment_support_scheme, > - misalignment, false, &inside_cost, > - &prologue_cost, cost_vec, cost_vec, > - true); > - } > - } > - vect_get_load_cost (vinfo, stmt_info, 1, > alignment_support_scheme, > - misalignment, false, &inside_cost, > - &prologue_cost, cost_vec, cost_vec, true); > - continue; > - } > - tree vec_array; > - > - vec_array = create_vector_array (vectype, vec_num); > - > tree final_mask = NULL_TREE; > - if (loop_masks) > - final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > - ncopies, vectype, j); > - if (vec_mask) > - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, > - final_mask, vec_mask, gsi); > - > - gcall *call; > - if (final_mask) > - { > - /* Emit: > - VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR, > - VEC_MASK). */ > - unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype)); > - tree alias_ptr = build_int_cst (ref_type, align); > - call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, > - dataref_ptr, alias_ptr, > - final_mask); > - } > - else > + tree final_len = NULL_TREE; > + tree bias = NULL_TREE; > + if (!costing_p) > { > - /* Emit: > - VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). 
*/ > - data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type); > - call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref); > - } > - gimple_call_set_lhs (call, vec_array); > - gimple_call_set_nothrow (call, true); > - vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); > - new_stmt = call; > + if (loop_masks) > + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > + vec_num * ncopies, vectype, > + vec_num * j + i); > + if (vec_mask) > + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, > + final_mask, vec_mask, gsi); > > - /* Extract each vector into an SSA_NAME. */ > - for (i = 0; i < vec_num; i++) > - { > - new_temp = read_vector_array (vinfo, stmt_info, gsi, > scalar_dest, > - vec_array, i); > - dr_chain.quick_push (new_temp); > + if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, > + gsi, stmt_info, bump); > } > > - /* Record the mapping between SSA_NAMEs and statements. */ > - vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); > - > - /* Record that VEC_ARRAY is now dead. */ > - vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); > - } > - else > - { > - for (i = 0; i < vec_num; i++) > + /* 2. Create the vector-load in the loop. */ > + switch (alignment_support_scheme) > { > - tree final_mask = NULL_TREE; > - tree final_len = NULL_TREE; > - tree bias = NULL_TREE; > - if (!costing_p) > - { > - if (loop_masks) > - final_mask > - = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > - vec_num * ncopies, vectype, > - vec_num * j + i); > - if (vec_mask) > - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, > - final_mask, vec_mask, gsi); > - > - if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, > ptr_incr, > - gsi, stmt_info, bump); > - } > + case dr_aligned: > + case dr_unaligned_supported: > + { > + unsigned int misalign; > + unsigned HOST_WIDE_INT align; > > - /* 2. Create the vector-load in the loop. 
*/ > - switch (alignment_support_scheme) > - { > - case dr_aligned: > - case dr_unaligned_supported: > + if (memory_access_type == VMAT_GATHER_SCATTER > + && gs_info.ifn != IFN_LAST) > { > - unsigned int misalign; > - unsigned HOST_WIDE_INT align; > - > - if (memory_access_type == VMAT_GATHER_SCATTER > - && gs_info.ifn != IFN_LAST) > + if (costing_p) > { > - if (costing_p) > - { > - unsigned int cnunits > - = vect_nunits_for_cost (vectype); > - inside_cost > - = record_stmt_cost (cost_vec, cnunits, > - scalar_load, stmt_info, 0, > - vect_body); > - break; > - } > - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > - vec_offset = vec_offsets[vec_num * j + i]; > - tree zero = build_zero_cst (vectype); > - tree scale = size_int (gs_info.scale); > - > - if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD) > - { > - if (loop_lens) > - final_len > - = vect_get_loop_len (loop_vinfo, gsi, > loop_lens, > - vec_num * ncopies, > vectype, > - vec_num * j + i, 1); > - else > - final_len = build_int_cst (sizetype, > - TYPE_VECTOR_SUBPARTS > ( > - vectype)); > - signed char biasval > - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS > (loop_vinfo); > - bias = build_int_cst (intQI_type_node, biasval); > - if (!final_mask) > - { > - mask_vectype = truth_type_for (vectype); > - final_mask = build_minus_one_cst > (mask_vectype); > - } > - } > - > - gcall *call; > - if (final_len && final_mask) > - call = gimple_build_call_internal ( > - IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, > - vec_offset, scale, zero, final_mask, final_len, > - bias); > - else if (final_mask) > - call = gimple_build_call_internal > - (IFN_MASK_GATHER_LOAD, 5, dataref_ptr, > - vec_offset, scale, zero, final_mask); > - else > - call = gimple_build_call_internal > - (IFN_GATHER_LOAD, 4, dataref_ptr, > - vec_offset, scale, zero); > - gimple_call_set_nothrow (call, true); > - new_stmt = call; > - data_ref = NULL_TREE; > + unsigned int cnunits = vect_nunits_for_cost (vectype); > + inside_cost > + = record_stmt_cost (cost_vec, cnunits, scalar_load, > + stmt_info, 0, vect_body); > break; > } > - else if (memory_access_type == VMAT_GATHER_SCATTER) > + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + vec_offset = vec_offsets[vec_num * j + i]; > + tree zero = build_zero_cst (vectype); > + tree scale = size_int (gs_info.scale); > + > + if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD) > { > - /* Emulated gather-scatter. */ > - gcc_assert (!final_mask); > - unsigned HOST_WIDE_INT const_nunits > - = nunits.to_constant (); > - if (costing_p) > - { > - /* For emulated gathers N offset vector element > - offset add is consumed by the load). */ > - inside_cost > - = record_stmt_cost (cost_vec, const_nunits, > - vec_to_scalar, stmt_info, 0, > - vect_body); > - /* N scalar loads plus gathering them into a > - vector. */ > - inside_cost > - = record_stmt_cost (cost_vec, const_nunits, > - scalar_load, stmt_info, 0, > - vect_body); > - inside_cost > - = record_stmt_cost (cost_vec, 1, vec_construct, > - stmt_info, 0, vect_body); > - break; > - } > - unsigned HOST_WIDE_INT const_offset_nunits > - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype) > - .to_constant (); > - vec<constructor_elt, va_gc> *ctor_elts; > - vec_alloc (ctor_elts, const_nunits); > - gimple_seq stmts = NULL; > - /* We support offset vectors with more elements > - than the data vector for now. 
*/ > - unsigned HOST_WIDE_INT factor > - = const_offset_nunits / const_nunits; > - vec_offset = vec_offsets[j / factor]; > - unsigned elt_offset = (j % factor) * const_nunits; > - tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); > - tree scale = size_int (gs_info.scale); > - align > - = get_object_alignment (DR_REF (first_dr_info->dr)); > - tree ltype = build_aligned_type (TREE_TYPE (vectype), > - align); > - for (unsigned k = 0; k < const_nunits; ++k) > + if (loop_lens) > + final_len > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, > + vec_num * ncopies, vectype, > + vec_num * j + i, 1); > + else > + final_len > + = build_int_cst (sizetype, > + TYPE_VECTOR_SUBPARTS (vectype)); > + signed char biasval > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + bias = build_int_cst (intQI_type_node, biasval); > + if (!final_mask) > { > - tree boff = size_binop (MULT_EXPR, > - TYPE_SIZE (idx_type), > - bitsize_int > - (k + elt_offset)); > - tree idx = gimple_build (&stmts, BIT_FIELD_REF, > - idx_type, vec_offset, > - TYPE_SIZE (idx_type), > - boff); > - idx = gimple_convert (&stmts, sizetype, idx); > - idx = gimple_build (&stmts, MULT_EXPR, > - sizetype, idx, scale); > - tree ptr = gimple_build (&stmts, PLUS_EXPR, > - TREE_TYPE (dataref_ptr), > - dataref_ptr, idx); > - ptr = gimple_convert (&stmts, ptr_type_node, ptr); > - tree elt = make_ssa_name (TREE_TYPE (vectype)); > - tree ref = build2 (MEM_REF, ltype, ptr, > - build_int_cst (ref_type, 0)); > - new_stmt = gimple_build_assign (elt, ref); > - gimple_set_vuse (new_stmt, > - gimple_vuse (gsi_stmt (*gsi))); > - gimple_seq_add_stmt (&stmts, new_stmt); > - CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, > elt); > + mask_vectype = truth_type_for (vectype); > + final_mask = build_minus_one_cst (mask_vectype); > } > - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); > - new_stmt = gimple_build_assign (NULL_TREE, > - build_constructor > - (vectype, > ctor_elts)); > - data_ref = NULL_TREE; > - break; > } > > - if (costing_p) > - break; > - > - align = > - known_alignment (DR_TARGET_ALIGNMENT (first_dr_info)); > - if (alignment_support_scheme == dr_aligned) > - misalign = 0; > - else if (misalignment == DR_MISALIGNMENT_UNKNOWN) > - { > - align = dr_alignment > - (vect_dr_behavior (vinfo, first_dr_info)); > - misalign = 0; > - } > + gcall *call; > + if (final_len && final_mask) > + call = gimple_build_call_internal ( > + IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset, > + scale, zero, final_mask, final_len, bias); > + else if (final_mask) > + call > + = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5, > + dataref_ptr, vec_offset, > + scale, zero, > final_mask); > else > - misalign = misalignment; > - if (dataref_offset == NULL_TREE > - && TREE_CODE (dataref_ptr) == SSA_NAME) > - set_ptr_info_alignment (get_ptr_info (dataref_ptr), > - align, misalign); > - align = least_bit_hwi (misalign | align); > - > - /* Compute IFN when LOOP_LENS or final_mask valid. */ > - machine_mode vmode = TYPE_MODE (vectype); > - machine_mode new_vmode = vmode; > - internal_fn partial_ifn = IFN_LAST; > - if (loop_lens) > + call > + = gimple_build_call_internal (IFN_GATHER_LOAD, 4, > + dataref_ptr, vec_offset, > + scale, zero); > + gimple_call_set_nothrow (call, true); > + new_stmt = call; > + data_ref = NULL_TREE; > + break; > + } > + else if (memory_access_type == VMAT_GATHER_SCATTER) > + { > + /* Emulated gather-scatter. 
*/ > + gcc_assert (!final_mask); > + unsigned HOST_WIDE_INT const_nunits = nunits.to_constant > (); > + if (costing_p) > { > - opt_machine_mode new_ovmode > - = get_len_load_store_mode (vmode, true, > - &partial_ifn); > - new_vmode = new_ovmode.require (); > - unsigned factor = (new_ovmode == vmode) > - ? 1 > - : GET_MODE_UNIT_SIZE (vmode); > - final_len > - = vect_get_loop_len (loop_vinfo, gsi, loop_lens, > - vec_num * ncopies, vectype, > - vec_num * j + i, factor); > + /* For emulated gathers N offset vector element > + offset add is consumed by the load). */ > + inside_cost > + = record_stmt_cost (cost_vec, const_nunits, > + vec_to_scalar, stmt_info, 0, > + vect_body); > + /* N scalar loads plus gathering them into a > + vector. */ > + inside_cost = record_stmt_cost (cost_vec, > const_nunits, > + scalar_load, > stmt_info, > + 0, vect_body); > + inside_cost > + = record_stmt_cost (cost_vec, 1, vec_construct, > + stmt_info, 0, vect_body); > + break; > } > - else if (final_mask) > + unsigned HOST_WIDE_INT const_offset_nunits > + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype) > + .to_constant (); > + vec<constructor_elt, va_gc> *ctor_elts; > + vec_alloc (ctor_elts, const_nunits); > + gimple_seq stmts = NULL; > + /* We support offset vectors with more elements > + than the data vector for now. */ > + unsigned HOST_WIDE_INT factor > + = const_offset_nunits / const_nunits; > + vec_offset = vec_offsets[j / factor]; > + unsigned elt_offset = (j % factor) * const_nunits; > + tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); > + tree scale = size_int (gs_info.scale); > + align = get_object_alignment (DR_REF (first_dr_info->dr)); > + tree ltype > + = build_aligned_type (TREE_TYPE (vectype), align); > + for (unsigned k = 0; k < const_nunits; ++k) > { > - if (!can_vec_mask_load_store_p ( > - vmode, TYPE_MODE (TREE_TYPE (final_mask)), true, > - &partial_ifn)) > - gcc_unreachable (); > + tree boff = size_binop (MULT_EXPR, TYPE_SIZE > (idx_type), > + bitsize_int (k + elt_offset)); > + tree idx = gimple_build (&stmts, BIT_FIELD_REF, > + idx_type, vec_offset, > + TYPE_SIZE (idx_type), boff); > + idx = gimple_convert (&stmts, sizetype, idx); > + idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx, > + scale); > + tree ptr = gimple_build (&stmts, PLUS_EXPR, > + TREE_TYPE (dataref_ptr), > + dataref_ptr, idx); > + ptr = gimple_convert (&stmts, ptr_type_node, ptr); > + tree elt = make_ssa_name (TREE_TYPE (vectype)); > + tree ref = build2 (MEM_REF, ltype, ptr, > + build_int_cst (ref_type, 0)); > + new_stmt = gimple_build_assign (elt, ref); > + gimple_set_vuse (new_stmt, > + gimple_vuse (gsi_stmt (*gsi))); > + gimple_seq_add_stmt (&stmts, new_stmt); > + CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt); > } > + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); > + new_stmt = gimple_build_assign ( > + NULL_TREE, build_constructor (vectype, ctor_elts)); > + data_ref = NULL_TREE; > + break; > + } > > - if (partial_ifn == IFN_MASK_LEN_LOAD) > + if (costing_p) > + break; > + > + align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info)); > + if (alignment_support_scheme == dr_aligned) > + misalign = 0; > + else if (misalignment == DR_MISALIGNMENT_UNKNOWN) > + { > + align > + = dr_alignment (vect_dr_behavior (vinfo, > first_dr_info)); > + misalign = 0; > + } > + else > + misalign = misalignment; > + if (dataref_offset == NULL_TREE > + && TREE_CODE (dataref_ptr) == SSA_NAME) > + set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, > + misalign); > + align = least_bit_hwi (misalign | align); > + > + /* 
Compute IFN when LOOP_LENS or final_mask valid. */ > + machine_mode vmode = TYPE_MODE (vectype); > + machine_mode new_vmode = vmode; > + internal_fn partial_ifn = IFN_LAST; > + if (loop_lens) > + { > + opt_machine_mode new_ovmode > + = get_len_load_store_mode (vmode, true, &partial_ifn); > + new_vmode = new_ovmode.require (); > + unsigned factor > + = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE > (vmode); > + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens, > + vec_num * ncopies, vectype, > + vec_num * j + i, factor); > + } > + else if (final_mask) > + { > + if (!can_vec_mask_load_store_p ( > + vmode, TYPE_MODE (TREE_TYPE (final_mask)), true, > + &partial_ifn)) > + gcc_unreachable (); > + } > + > + if (partial_ifn == IFN_MASK_LEN_LOAD) > + { > + if (!final_len) > { > - if (!final_len) > - { > - /* Pass VF value to 'len' argument of > - MASK_LEN_LOAD if LOOP_LENS is invalid. */ > - final_len > - = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > - } > - if (!final_mask) > - { > - /* Pass all ones value to 'mask' argument of > - MASK_LEN_LOAD if final_mask is invalid. */ > - mask_vectype = truth_type_for (vectype); > - final_mask = build_minus_one_cst (mask_vectype); > - } > + /* Pass VF value to 'len' argument of > + MASK_LEN_LOAD if LOOP_LENS is invalid. */ > + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > } > - if (final_len) > + if (!final_mask) > { > - signed char biasval > - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > - > - bias = build_int_cst (intQI_type_node, biasval); > + /* Pass all ones value to 'mask' argument of > + MASK_LEN_LOAD if final_mask is invalid. */ > + mask_vectype = truth_type_for (vectype); > + final_mask = build_minus_one_cst (mask_vectype); > } > + } > + if (final_len) > + { > + signed char biasval > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > > - if (final_len) > + bias = build_int_cst (intQI_type_node, biasval); > + } > + > + if (final_len) > + { > + tree ptr = build_int_cst (ref_type, align * > BITS_PER_UNIT); > + gcall *call; > + if (partial_ifn == IFN_MASK_LEN_LOAD) > + call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5, > + dataref_ptr, ptr, > + final_mask, > final_len, > + bias); > + else > + call = gimple_build_call_internal (IFN_LEN_LOAD, 4, > + dataref_ptr, ptr, > + final_len, bias); > + gimple_call_set_nothrow (call, true); > + new_stmt = call; > + data_ref = NULL_TREE; > + > + /* Need conversion if it's wrapped with VnQI. */ > + if (vmode != new_vmode) > { > - tree ptr > - = build_int_cst (ref_type, align * BITS_PER_UNIT); > - gcall *call; > - if (partial_ifn == IFN_MASK_LEN_LOAD) > - call = gimple_build_call_internal > (IFN_MASK_LEN_LOAD, > - 5, dataref_ptr, > - ptr, final_mask, > - final_len, bias); > - else > - call = gimple_build_call_internal (IFN_LEN_LOAD, 4, > - dataref_ptr, ptr, > - final_len, bias); > - gimple_call_set_nothrow (call, true); > - new_stmt = call; > - data_ref = NULL_TREE; > - > - /* Need conversion if it's wrapped with VnQI. 
*/ > - if (vmode != new_vmode) > - { > - tree new_vtype = build_vector_type_for_mode ( > - unsigned_intQI_type_node, new_vmode); > - tree var = vect_get_new_ssa_name (new_vtype, > - > vect_simple_var); > - gimple_set_lhs (call, var); > - vect_finish_stmt_generation (vinfo, stmt_info, > call, > - gsi); > - tree op = build1 (VIEW_CONVERT_EXPR, vectype, > var); > - new_stmt > - = gimple_build_assign (vec_dest, > - VIEW_CONVERT_EXPR, op); > - } > + tree new_vtype = build_vector_type_for_mode ( > + unsigned_intQI_type_node, new_vmode); > + tree var > + = vect_get_new_ssa_name (new_vtype, > vect_simple_var); > + gimple_set_lhs (call, var); > + vect_finish_stmt_generation (vinfo, stmt_info, call, > + gsi); > + tree op = build1 (VIEW_CONVERT_EXPR, vectype, var); > + new_stmt = gimple_build_assign (vec_dest, > + VIEW_CONVERT_EXPR, > op); > } > - else if (final_mask) > + } > + else if (final_mask) > + { > + tree ptr = build_int_cst (ref_type, align * > BITS_PER_UNIT); > + gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, > 3, > + dataref_ptr, > ptr, > + final_mask); > + gimple_call_set_nothrow (call, true); > + new_stmt = call; > + data_ref = NULL_TREE; > + } > + else > + { > + tree ltype = vectype; > + tree new_vtype = NULL_TREE; > + unsigned HOST_WIDE_INT gap = DR_GROUP_GAP > (first_stmt_info); > + unsigned int vect_align > + = vect_known_alignment_in_bytes (first_dr_info, > vectype); > + unsigned int scalar_dr_size > + = vect_get_scalar_dr_size (first_dr_info); > + /* If there's no peeling for gaps but we have a gap > + with slp loads then load the lower half of the > + vector only. See get_group_load_store_type for > + when we apply this optimization. */ > + if (slp > + && loop_vinfo > + && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap > != 0 > + && known_eq (nunits, (group_size - gap) * 2) > + && known_eq (nunits, group_size) > + && gap >= (vect_align / scalar_dr_size)) > { > - tree ptr = build_int_cst (ref_type, > - align * BITS_PER_UNIT); > - gcall *call > - = gimple_build_call_internal (IFN_MASK_LOAD, 3, > - dataref_ptr, ptr, > - final_mask); > - gimple_call_set_nothrow (call, true); > - new_stmt = call; > - data_ref = NULL_TREE; > + tree half_vtype; > + new_vtype > + = vector_vector_composition_type (vectype, 2, > + &half_vtype); > + if (new_vtype != NULL_TREE) > + ltype = half_vtype; > } > + tree offset > + = (dataref_offset ? dataref_offset > + : build_int_cst (ref_type, 0)); > + if (ltype != vectype > + && memory_access_type == VMAT_CONTIGUOUS_REVERSE) > + { > + unsigned HOST_WIDE_INT gap_offset > + = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type)); > + tree gapcst = build_int_cst (ref_type, gap_offset); > + offset = size_binop (PLUS_EXPR, offset, gapcst); > + } > + data_ref > + = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); > + if (alignment_support_scheme == dr_aligned) > + ; > else > + TREE_TYPE (data_ref) > + = build_aligned_type (TREE_TYPE (data_ref), > + align * BITS_PER_UNIT); > + if (ltype != vectype) > { > - tree ltype = vectype; > - tree new_vtype = NULL_TREE; > - unsigned HOST_WIDE_INT gap > - = DR_GROUP_GAP (first_stmt_info); > - unsigned int vect_align > - = vect_known_alignment_in_bytes (first_dr_info, > - vectype); > - unsigned int scalar_dr_size > - = vect_get_scalar_dr_size (first_dr_info); > - /* If there's no peeling for gaps but we have a gap > - with slp loads then load the lower half of the > - vector only. See get_group_load_store_type for > - when we apply this optimization. 
*/ > - if (slp > - && loop_vinfo > - && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) > - && gap != 0 > - && known_eq (nunits, (group_size - gap) * 2) > - && known_eq (nunits, group_size) > - && gap >= (vect_align / scalar_dr_size)) > + vect_copy_ref_info (data_ref, > + DR_REF (first_dr_info->dr)); > + tree tem = make_ssa_name (ltype); > + new_stmt = gimple_build_assign (tem, data_ref); > + vect_finish_stmt_generation (vinfo, stmt_info, > new_stmt, > + gsi); > + data_ref = NULL; > + vec<constructor_elt, va_gc> *v; > + vec_alloc (v, 2); > + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) > { > - tree half_vtype; > - new_vtype > - = vector_vector_composition_type (vectype, 2, > - &half_vtype); > - if (new_vtype != NULL_TREE) > - ltype = half_vtype; > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, > + build_zero_cst (ltype)); > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); > } > - tree offset > - = (dataref_offset ? dataref_offset > - : build_int_cst (ref_type, 0)); > - if (ltype != vectype > - && memory_access_type == VMAT_CONTIGUOUS_REVERSE) > + else > { > - unsigned HOST_WIDE_INT gap_offset > - = gap * tree_to_uhwi (TYPE_SIZE_UNIT > (elem_type)); > - tree gapcst = build_int_cst (ref_type, > gap_offset); > - offset = size_binop (PLUS_EXPR, offset, gapcst); > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, > + build_zero_cst (ltype)); > } > - data_ref > - = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); > - if (alignment_support_scheme == dr_aligned) > - ; > + gcc_assert (new_vtype != NULL_TREE); > + if (new_vtype == vectype) > + new_stmt = gimple_build_assign ( > + vec_dest, build_constructor (vectype, v)); > else > - TREE_TYPE (data_ref) > - = build_aligned_type (TREE_TYPE (data_ref), > - align * BITS_PER_UNIT); > - if (ltype != vectype) > { > - vect_copy_ref_info (data_ref, > - DR_REF (first_dr_info->dr)); > - tree tem = make_ssa_name (ltype); > - new_stmt = gimple_build_assign (tem, data_ref); > + tree new_vname = make_ssa_name (new_vtype); > + new_stmt = gimple_build_assign ( > + new_vname, build_constructor (new_vtype, v)); > vect_finish_stmt_generation (vinfo, stmt_info, > new_stmt, gsi); > - data_ref = NULL; > - vec<constructor_elt, va_gc> *v; > - vec_alloc (v, 2); > - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) > - { > - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, > - build_zero_cst > (ltype)); > - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); > - } > - else > - { > - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); > - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, > - build_zero_cst > (ltype)); > - } > - gcc_assert (new_vtype != NULL_TREE); > - if (new_vtype == vectype) > - new_stmt = gimple_build_assign ( > - vec_dest, build_constructor (vectype, v)); > - else > - { > - tree new_vname = make_ssa_name (new_vtype); > - new_stmt = gimple_build_assign ( > - new_vname, build_constructor (new_vtype, > v)); > - vect_finish_stmt_generation (vinfo, stmt_info, > - new_stmt, gsi); > - new_stmt = gimple_build_assign ( > - vec_dest, build1 (VIEW_CONVERT_EXPR, > vectype, > - new_vname)); > - } > + new_stmt = gimple_build_assign ( > + vec_dest, > + build1 (VIEW_CONVERT_EXPR, vectype, new_vname)); > } > } > - break; > } > - case dr_explicit_realign: > - { > - if (costing_p) > - break; > - tree ptr, bump; > - > - tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > + break; > + } > + case dr_explicit_realign: > + { > + if (costing_p) > + break; > + tree ptr, bump; > > - if (compute_in_loop) > - msq = vect_setup_realignment (vinfo, first_stmt_info, > gsi, > - 
&realignment_token, > - dr_explicit_realign, > - dataref_ptr, NULL); > + tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > > - if (TREE_CODE (dataref_ptr) == SSA_NAME) > - ptr = copy_ssa_name (dataref_ptr); > - else > - ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); > - // For explicit realign the target alignment should be > - // known at compile time. > - unsigned HOST_WIDE_INT align = > - DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); > - new_stmt = gimple_build_assign > - (ptr, BIT_AND_EXPR, dataref_ptr, > - build_int_cst > - (TREE_TYPE (dataref_ptr), > - -(HOST_WIDE_INT) align)); > - vect_finish_stmt_generation (vinfo, stmt_info, > - new_stmt, gsi); > - data_ref > - = build2 (MEM_REF, vectype, ptr, > - build_int_cst (ref_type, 0)); > - vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); > - vec_dest = vect_create_destination_var (scalar_dest, > - vectype); > - new_stmt = gimple_build_assign (vec_dest, data_ref); > - new_temp = make_ssa_name (vec_dest, new_stmt); > - gimple_assign_set_lhs (new_stmt, new_temp); > - gimple_move_vops (new_stmt, stmt_info->stmt); > - vect_finish_stmt_generation (vinfo, stmt_info, > - new_stmt, gsi); > - msq = new_temp; > - > - bump = size_binop (MULT_EXPR, vs, > - TYPE_SIZE_UNIT (elem_type)); > - bump = size_binop (MINUS_EXPR, bump, size_one_node); > - ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, > - stmt_info, bump); > - new_stmt = gimple_build_assign > - (NULL_TREE, BIT_AND_EXPR, ptr, > - build_int_cst > - (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); > - if (TREE_CODE (ptr) == SSA_NAME) > - ptr = copy_ssa_name (ptr, new_stmt); > - else > - ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt); > - gimple_assign_set_lhs (new_stmt, ptr); > - vect_finish_stmt_generation (vinfo, stmt_info, > - new_stmt, gsi); > - data_ref > - = build2 (MEM_REF, vectype, ptr, > - build_int_cst (ref_type, 0)); > - break; > - } > - case dr_explicit_realign_optimized: > - { > - if (costing_p) > - break; > - if (TREE_CODE (dataref_ptr) == SSA_NAME) > - new_temp = copy_ssa_name (dataref_ptr); > - else > - new_temp = make_ssa_name (TREE_TYPE (dataref_ptr)); > - // We should only be doing this if we know the target > - // alignment at compile time. > - unsigned HOST_WIDE_INT align = > - DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); > - new_stmt = gimple_build_assign > - (new_temp, BIT_AND_EXPR, dataref_ptr, > - build_int_cst (TREE_TYPE (dataref_ptr), > - -(HOST_WIDE_INT) align)); > - vect_finish_stmt_generation (vinfo, stmt_info, > - new_stmt, gsi); > - data_ref > - = build2 (MEM_REF, vectype, new_temp, > - build_int_cst (ref_type, 0)); > - break; > - } > - default: > - gcc_unreachable (); > - } > + if (compute_in_loop) > + msq = vect_setup_realignment (vinfo, first_stmt_info, gsi, > + &realignment_token, > + dr_explicit_realign, > + dataref_ptr, NULL); > + > + if (TREE_CODE (dataref_ptr) == SSA_NAME) > + ptr = copy_ssa_name (dataref_ptr); > + else > + ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); > + // For explicit realign the target alignment should be > + // known at compile time. 
> + unsigned HOST_WIDE_INT align > + = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); > + new_stmt = gimple_build_assign ( > + ptr, BIT_AND_EXPR, dataref_ptr, > + build_int_cst (TREE_TYPE (dataref_ptr), > + -(HOST_WIDE_INT) align)); > + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); > + data_ref > + = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, > 0)); > + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); > + vec_dest = vect_create_destination_var (scalar_dest, vectype); > + new_stmt = gimple_build_assign (vec_dest, data_ref); > + new_temp = make_ssa_name (vec_dest, new_stmt); > + gimple_assign_set_lhs (new_stmt, new_temp); > + gimple_move_vops (new_stmt, stmt_info->stmt); > + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); > + msq = new_temp; > + > + bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type)); > + bump = size_binop (MINUS_EXPR, bump, size_one_node); > + ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, > stmt_info, > + bump); > + new_stmt = gimple_build_assign ( > + NULL_TREE, BIT_AND_EXPR, ptr, > + build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); > + if (TREE_CODE (ptr) == SSA_NAME) > + ptr = copy_ssa_name (ptr, new_stmt); > + else > + ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt); > + gimple_assign_set_lhs (new_stmt, ptr); > + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); > + data_ref > + = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, > 0)); > + break; > + } > + case dr_explicit_realign_optimized: > + { > + if (costing_p) > + break; > + if (TREE_CODE (dataref_ptr) == SSA_NAME) > + new_temp = copy_ssa_name (dataref_ptr); > + else > + new_temp = make_ssa_name (TREE_TYPE (dataref_ptr)); > + // We should only be doing this if we know the target > + // alignment at compile time. > + unsigned HOST_WIDE_INT align > + = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); > + new_stmt = gimple_build_assign ( > + new_temp, BIT_AND_EXPR, dataref_ptr, > + build_int_cst (TREE_TYPE (dataref_ptr), > + -(HOST_WIDE_INT) align)); > + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); > + data_ref = build2 (MEM_REF, vectype, new_temp, > + build_int_cst (ref_type, 0)); > + break; > + } > + default: > + gcc_unreachable (); > + } > > - /* One common place to cost the above vect load for different > - alignment support schemes. */ > - if (costing_p) > - { > - /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we > - only need to take care of the first stmt, whose > - stmt_info is first_stmt_info, vec_num iterating on it > - will cover the cost for the remaining, it's consistent > - with transforming. For the prologue cost for realign, > - we only need to count it once for the whole group. */ > - bool first_stmt_info_p = first_stmt_info == stmt_info; > - bool add_realign_cost = first_stmt_info_p && i == 0; > - if (memory_access_type == VMAT_CONTIGUOUS > - || memory_access_type == VMAT_CONTIGUOUS_REVERSE > - || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE > - && (!grouped_load || first_stmt_info_p))) > - vect_get_load_cost (vinfo, stmt_info, 1, > - alignment_support_scheme, > misalignment, > - add_realign_cost, &inside_cost, > - &prologue_cost, cost_vec, cost_vec, > - true); > - } > - else > + /* One common place to cost the above vect load for different > + alignment support schemes. 
*/ > + if (costing_p) > + { > + /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we > + only need to take care of the first stmt, whose > + stmt_info is first_stmt_info, vec_num iterating on it > + will cover the cost for the remaining, it's consistent > + with transforming. For the prologue cost for realign, > + we only need to count it once for the whole group. */ > + bool first_stmt_info_p = first_stmt_info == stmt_info; > + bool add_realign_cost = first_stmt_info_p && i == 0; > + if (memory_access_type == VMAT_CONTIGUOUS > + || memory_access_type == VMAT_CONTIGUOUS_REVERSE > + || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE > + && (!grouped_load || first_stmt_info_p))) > + vect_get_load_cost (vinfo, stmt_info, 1, > + alignment_support_scheme, misalignment, > + add_realign_cost, &inside_cost, > + &prologue_cost, cost_vec, cost_vec, true); > + } > + else > + { > + vec_dest = vect_create_destination_var (scalar_dest, vectype); > + /* DATA_REF is null if we've already built the statement. */ > + if (data_ref) > { > - vec_dest = vect_create_destination_var (scalar_dest, > vectype); > - /* DATA_REF is null if we've already built the statement. > */ > - if (data_ref) > - { > - vect_copy_ref_info (data_ref, DR_REF > (first_dr_info->dr)); > - new_stmt = gimple_build_assign (vec_dest, data_ref); > - } > - new_temp = make_ssa_name (vec_dest, new_stmt); > - gimple_set_lhs (new_stmt, new_temp); > - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > gsi); > + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); > + new_stmt = gimple_build_assign (vec_dest, data_ref); > } > + new_temp = make_ssa_name (vec_dest, new_stmt); > + gimple_set_lhs (new_stmt, new_temp); > + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); > + } > > - /* 3. Handle explicit realignment if necessary/supported. > - Create in loop: > - vec_dest = realign_load (msq, lsq, realignment_token) */ > - if (!costing_p > - && (alignment_support_scheme == > dr_explicit_realign_optimized > - || alignment_support_scheme == dr_explicit_realign)) > - { > - lsq = gimple_assign_lhs (new_stmt); > - if (!realignment_token) > - realignment_token = dataref_ptr; > - vec_dest = vect_create_destination_var (scalar_dest, > vectype); > - new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, > - msq, lsq, > realignment_token); > - new_temp = make_ssa_name (vec_dest, new_stmt); > - gimple_assign_set_lhs (new_stmt, new_temp); > - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, > gsi); > + /* 3. Handle explicit realignment if necessary/supported. 
> + Create in loop: > + vec_dest = realign_load (msq, lsq, realignment_token) */ > + if (!costing_p > + && (alignment_support_scheme == dr_explicit_realign_optimized > + || alignment_support_scheme == dr_explicit_realign)) > + { > + lsq = gimple_assign_lhs (new_stmt); > + if (!realignment_token) > + realignment_token = dataref_ptr; > + vec_dest = vect_create_destination_var (scalar_dest, vectype); > + new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, > msq, > + lsq, realignment_token); > + new_temp = make_ssa_name (vec_dest, new_stmt); > + gimple_assign_set_lhs (new_stmt, new_temp); > + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); > > - if (alignment_support_scheme == > dr_explicit_realign_optimized) > - { > - gcc_assert (phi); > - if (i == vec_num - 1 && j == ncopies - 1) > - add_phi_arg (phi, lsq, > - loop_latch_edge (containing_loop), > - UNKNOWN_LOCATION); > - msq = lsq; > - } > + if (alignment_support_scheme == dr_explicit_realign_optimized) > + { > + gcc_assert (phi); > + if (i == vec_num - 1 && j == ncopies - 1) > + add_phi_arg (phi, lsq, loop_latch_edge (containing_loop), > + UNKNOWN_LOCATION); > + msq = lsq; > } > + } > > - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) > + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) > + { > + if (costing_p) > + inside_cost = record_stmt_cost (cost_vec, 1, vec_perm, > + stmt_info, 0, vect_body); > + else > { > - if (costing_p) > - inside_cost = record_stmt_cost (cost_vec, 1, vec_perm, > - stmt_info, 0, vect_body); > - else > - { > - tree perm_mask = perm_mask_for_reverse (vectype); > - new_temp > - = permute_vec_elements (vinfo, new_temp, new_temp, > - perm_mask, stmt_info, gsi); > - new_stmt = SSA_NAME_DEF_STMT (new_temp); > - } > + tree perm_mask = perm_mask_for_reverse (vectype); > + new_temp = permute_vec_elements (vinfo, new_temp, new_temp, > + perm_mask, stmt_info, gsi); > + new_stmt = SSA_NAME_DEF_STMT (new_temp); > } > + } > > - /* Collect vector loads and later create their permutation in > - vect_transform_grouped_load (). */ > - if (!costing_p && (grouped_load || slp_perm)) > - dr_chain.quick_push (new_temp); > + /* Collect vector loads and later create their permutation in > + vect_transform_grouped_load (). */ > + if (!costing_p && (grouped_load || slp_perm)) > + dr_chain.quick_push (new_temp); > > - /* Store vector loads in the corresponding SLP_NODE. */ > - if (!costing_p && slp && !slp_perm) > - slp_node->push_vec_def (new_stmt); > + /* Store vector loads in the corresponding SLP_NODE. */ > + if (!costing_p && slp && !slp_perm) > + slp_node->push_vec_def (new_stmt); > > - /* With SLP permutation we load the gaps as well, without > - we need to skip the gaps after we manage to fully load > - all elements. group_gap_adj is DR_GROUP_SIZE here. */ > - group_elt += nunits; > - if (!costing_p > - && maybe_ne (group_gap_adj, 0U) > - && !slp_perm > - && known_eq (group_elt, group_size - group_gap_adj)) > - { > - poly_wide_int bump_val > - = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) > - * group_gap_adj); > - if (tree_int_cst_sgn > - (vect_dr_behavior (vinfo, dr_info)->step) == -1) > - bump_val = -bump_val; > - tree bump = wide_int_to_tree (sizetype, bump_val); > - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, > - gsi, stmt_info, bump); > - group_elt = 0; > - } > - } > - /* Bump the vector pointer to account for a gap or for excess > - elements loaded for a permuted SLP load. 
*/ > + /* With SLP permutation we load the gaps as well, without > + we need to skip the gaps after we manage to fully load > + all elements. group_gap_adj is DR_GROUP_SIZE here. */ > + group_elt += nunits; > if (!costing_p > && maybe_ne (group_gap_adj, 0U) > - && slp_perm) > + && !slp_perm > + && known_eq (group_elt, group_size - group_gap_adj)) > { > poly_wide_int bump_val > - = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) > - * group_gap_adj); > - if (tree_int_cst_sgn > - (vect_dr_behavior (vinfo, dr_info)->step) == -1) > + = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj); > + if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) > + == -1) > bump_val = -bump_val; > tree bump = wide_int_to_tree (sizetype, bump_val); > dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, > gsi, > stmt_info, bump); > + group_elt = 0; > } > } > + /* Bump the vector pointer to account for a gap or for excess > + elements loaded for a permuted SLP load. */ > + if (!costing_p > + && maybe_ne (group_gap_adj, 0U) > + && slp_perm) > + { > + poly_wide_int bump_val > + = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj); > + if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == > -1) > + bump_val = -bump_val; > + tree bump = wide_int_to_tree (sizetype, bump_val); > + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, > + stmt_info, bump); > + } > > if (slp && !slp_perm) > continue; > @@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo, > } > } > else > - { > - if (grouped_load) > - { > - if (memory_access_type != VMAT_LOAD_STORE_LANES) > + { > + if (grouped_load) > + { > + gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE); > + /* We assume that the cost of a single load-lanes instruction > + is equivalent to the cost of DR_GROUP_SIZE separate loads. > + If a grouped access is instead being provided by a > + load-and-permute operation, include the cost of the > + permutes. */ > + if (costing_p && first_stmt_info == stmt_info) > { > - gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE); > - /* We assume that the cost of a single load-lanes > instruction > - is equivalent to the cost of DR_GROUP_SIZE separate > loads. > - If a grouped access is instead being provided by a > - load-and-permute operation, include the cost of the > - permutes. */ > - if (costing_p && first_stmt_info == stmt_info) > - { > - /* Uses an even and odd extract operations or shuffle > - operations for each needed permute. */ > - int group_size = DR_GROUP_SIZE (first_stmt_info); > - int nstmts = ceil_log2 (group_size) * group_size; > - inside_cost > - += record_stmt_cost (cost_vec, nstmts, vec_perm, > - stmt_info, 0, vect_body); > + /* Uses an even and odd extract operations or shuffle > + operations for each needed permute. 
*/ > + int group_size = DR_GROUP_SIZE (first_stmt_info); > + int nstmts = ceil_log2 (group_size) * group_size; > + inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm, > + stmt_info, 0, vect_body); > > - if (dump_enabled_p ()) > - dump_printf_loc ( > - MSG_NOTE, vect_location, > - "vect_model_load_cost: strided group_size = %d .\n", > - group_size); > - } > - else if (!costing_p) > - vect_transform_grouped_load (vinfo, stmt_info, dr_chain, > - group_size, gsi); > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "vect_model_load_cost:" > + "strided group_size = %d .\n", > + group_size); > + } > + else if (!costing_p) > + { > + vect_transform_grouped_load (vinfo, stmt_info, dr_chain, > + group_size, gsi); > + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; > } > - if (!costing_p) > - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; > } > else if (!costing_p) > STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); > @@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo, > { > gcc_assert (memory_access_type != VMAT_INVARIANT > && memory_access_type != VMAT_ELEMENTWISE > - && memory_access_type != VMAT_STRIDED_SLP); > + && memory_access_type != VMAT_STRIDED_SLP > + && memory_access_type != VMAT_LOAD_STORE_LANES); > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, > "vect_model_load_cost: inside_cost = %u, " > -- > 2.31.1