Hi, Following Richi's suggestion [1], this patch moves the handling of VMAT_LOAD_STORE_LANES in the final loop nest of function vectorizable_load into its own loop. Basically, it duplicates the final loop nest, cleans up some setup code that is useless for the VMAT_LOAD_STORE_LANES case, and removes some unreachable code. It also removes the corresponding handling from the final loop nest.
Bootstrapped and regtested on x86_64-redhat-linux, aarch64-linux-gnu and powerpc64{,le}-linux-gnu. [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html gcc/ChangeLog: * tree-vect-stmts.cc (vectorizable_load): Move the handlings on VMAT_LOAD_STORE_LANES in the final loop nest to its own loop, and update the final nest accordingly. --- gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++-------------------- 1 file changed, 634 insertions(+), 641 deletions(-) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 4f2d088484c..c361e16cb7b 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo, vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask, &vec_masks, mask_vectype); } + tree vec_mask = NULL_TREE; + if (memory_access_type == VMAT_LOAD_STORE_LANES) + { + gcc_assert (alignment_support_scheme == dr_aligned + || alignment_support_scheme == dr_unaligned_supported); + gcc_assert (grouped_load && !slp); + + unsigned int inside_cost = 0, prologue_cost = 0; + for (j = 0; j < ncopies; j++) + { + if (costing_p) + { + /* An IFN_LOAD_LANES will load all its vector results, + regardless of which ones we actually need. Account + for the cost of unused results. 
*/ + if (first_stmt_info == stmt_info) + { + unsigned int gaps = DR_GROUP_SIZE (first_stmt_info); + stmt_vec_info next_stmt_info = first_stmt_info; + do + { + gaps -= 1; + next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); + } + while (next_stmt_info); + if (gaps) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_model_load_cost: %d " + "unused vectors.\n", + gaps); + vect_get_load_cost (vinfo, stmt_info, gaps, + alignment_support_scheme, + misalignment, false, &inside_cost, + &prologue_cost, cost_vec, cost_vec, + true); + } + } + vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme, + misalignment, false, &inside_cost, + &prologue_cost, cost_vec, cost_vec, true); + continue; + } + + /* 1. Create the vector or array pointer update chain. */ + if (j == 0) + dataref_ptr + = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type, + at_loop, offset, &dummy, gsi, + &ptr_incr, false, bump); + else + { + gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)); + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, + stmt_info, bump); + } + if (mask) + vec_mask = vec_masks[j]; + + tree vec_array = create_vector_array (vectype, vec_num); + + tree final_mask = NULL_TREE; + if (loop_masks) + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, + ncopies, vectype, j); + if (vec_mask) + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask, + vec_mask, gsi); + + gcall *call; + if (final_mask) + { + /* Emit: + VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR, + VEC_MASK). */ + unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype)); + tree alias_ptr = build_int_cst (ref_type, align); + call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, + dataref_ptr, alias_ptr, + final_mask); + } + else + { + /* Emit: + VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). 
*/ + data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type); + call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref); + } + gimple_call_set_lhs (call, vec_array); + gimple_call_set_nothrow (call, true); + vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); + + dr_chain.create (vec_num); + /* Extract each vector into an SSA_NAME. */ + for (i = 0; i < vec_num; i++) + { + new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest, + vec_array, i); + dr_chain.quick_push (new_temp); + } + + /* Record the mapping between SSA_NAMEs and statements. */ + vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); + + /* Record that VEC_ARRAY is now dead. */ + vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); + + dr_chain.release (); + + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; + } + + if (costing_p && dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_model_load_cost: inside_cost = %u, " + "prologue_cost = %u .\n", + inside_cost, prologue_cost); + + return true; + } + poly_uint64 group_elt = 0; unsigned int inside_cost = 0, prologue_cost = 0; for (j = 0; j < ncopies; j++) @@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo, dr_chain.create (vec_num); gimple *new_stmt = NULL; - if (memory_access_type == VMAT_LOAD_STORE_LANES) + for (i = 0; i < vec_num; i++) { - if (costing_p) - { - /* An IFN_LOAD_LANES will load all its vector results, - regardless of which ones we actually need. Account - for the cost of unused results. 
*/ - if (grouped_load && first_stmt_info == stmt_info) - { - unsigned int gaps = DR_GROUP_SIZE (first_stmt_info); - stmt_vec_info next_stmt_info = first_stmt_info; - do - { - gaps -= 1; - next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); - } - while (next_stmt_info); - if (gaps) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "vect_model_load_cost: %d " - "unused vectors.\n", - gaps); - vect_get_load_cost (vinfo, stmt_info, gaps, - alignment_support_scheme, - misalignment, false, &inside_cost, - &prologue_cost, cost_vec, cost_vec, - true); - } - } - vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme, - misalignment, false, &inside_cost, - &prologue_cost, cost_vec, cost_vec, true); - continue; - } - tree vec_array; - - vec_array = create_vector_array (vectype, vec_num); - tree final_mask = NULL_TREE; - if (loop_masks) - final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, - ncopies, vectype, j); - if (vec_mask) - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, - final_mask, vec_mask, gsi); - - gcall *call; - if (final_mask) - { - /* Emit: - VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR, - VEC_MASK). */ - unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype)); - tree alias_ptr = build_int_cst (ref_type, align); - call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, - dataref_ptr, alias_ptr, - final_mask); - } - else + tree final_len = NULL_TREE; + tree bias = NULL_TREE; + if (!costing_p) { - /* Emit: - VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). 
*/ - data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type); - call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref); - } - gimple_call_set_lhs (call, vec_array); - gimple_call_set_nothrow (call, true); - vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); - new_stmt = call; + if (loop_masks) + final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, + vec_num * ncopies, vectype, + vec_num * j + i); + if (vec_mask) + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, + final_mask, vec_mask, gsi); - /* Extract each vector into an SSA_NAME. */ - for (i = 0; i < vec_num; i++) - { - new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest, - vec_array, i); - dr_chain.quick_push (new_temp); + if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, + gsi, stmt_info, bump); } - /* Record the mapping between SSA_NAMEs and statements. */ - vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain); - - /* Record that VEC_ARRAY is now dead. */ - vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); - } - else - { - for (i = 0; i < vec_num; i++) + /* 2. Create the vector-load in the loop. */ + switch (alignment_support_scheme) { - tree final_mask = NULL_TREE; - tree final_len = NULL_TREE; - tree bias = NULL_TREE; - if (!costing_p) - { - if (loop_masks) - final_mask - = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, - vec_num * ncopies, vectype, - vec_num * j + i); - if (vec_mask) - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, - final_mask, vec_mask, gsi); - - if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, - gsi, stmt_info, bump); - } + case dr_aligned: + case dr_unaligned_supported: + { + unsigned int misalign; + unsigned HOST_WIDE_INT align; - /* 2. Create the vector-load in the loop. 
*/ - switch (alignment_support_scheme) - { - case dr_aligned: - case dr_unaligned_supported: + if (memory_access_type == VMAT_GATHER_SCATTER + && gs_info.ifn != IFN_LAST) { - unsigned int misalign; - unsigned HOST_WIDE_INT align; - - if (memory_access_type == VMAT_GATHER_SCATTER - && gs_info.ifn != IFN_LAST) + if (costing_p) { - if (costing_p) - { - unsigned int cnunits - = vect_nunits_for_cost (vectype); - inside_cost - = record_stmt_cost (cost_vec, cnunits, - scalar_load, stmt_info, 0, - vect_body); - break; - } - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) - vec_offset = vec_offsets[vec_num * j + i]; - tree zero = build_zero_cst (vectype); - tree scale = size_int (gs_info.scale); - - if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD) - { - if (loop_lens) - final_len - = vect_get_loop_len (loop_vinfo, gsi, loop_lens, - vec_num * ncopies, vectype, - vec_num * j + i, 1); - else - final_len = build_int_cst (sizetype, - TYPE_VECTOR_SUBPARTS ( - vectype)); - signed char biasval - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); - bias = build_int_cst (intQI_type_node, biasval); - if (!final_mask) - { - mask_vectype = truth_type_for (vectype); - final_mask = build_minus_one_cst (mask_vectype); - } - } - - gcall *call; - if (final_len && final_mask) - call = gimple_build_call_internal ( - IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, - vec_offset, scale, zero, final_mask, final_len, - bias); - else if (final_mask) - call = gimple_build_call_internal - (IFN_MASK_GATHER_LOAD, 5, dataref_ptr, - vec_offset, scale, zero, final_mask); - else - call = gimple_build_call_internal - (IFN_GATHER_LOAD, 4, dataref_ptr, - vec_offset, scale, zero); - gimple_call_set_nothrow (call, true); - new_stmt = call; - data_ref = NULL_TREE; + unsigned int cnunits = vect_nunits_for_cost (vectype); + inside_cost + = record_stmt_cost (cost_vec, cnunits, scalar_load, + stmt_info, 0, vect_body); break; } - else if (memory_access_type == VMAT_GATHER_SCATTER) + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + 
vec_offset = vec_offsets[vec_num * j + i]; + tree zero = build_zero_cst (vectype); + tree scale = size_int (gs_info.scale); + + if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD) { - /* Emulated gather-scatter. */ - gcc_assert (!final_mask); - unsigned HOST_WIDE_INT const_nunits - = nunits.to_constant (); - if (costing_p) - { - /* For emulated gathers N offset vector element - offset add is consumed by the load). */ - inside_cost - = record_stmt_cost (cost_vec, const_nunits, - vec_to_scalar, stmt_info, 0, - vect_body); - /* N scalar loads plus gathering them into a - vector. */ - inside_cost - = record_stmt_cost (cost_vec, const_nunits, - scalar_load, stmt_info, 0, - vect_body); - inside_cost - = record_stmt_cost (cost_vec, 1, vec_construct, - stmt_info, 0, vect_body); - break; - } - unsigned HOST_WIDE_INT const_offset_nunits - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype) - .to_constant (); - vec<constructor_elt, va_gc> *ctor_elts; - vec_alloc (ctor_elts, const_nunits); - gimple_seq stmts = NULL; - /* We support offset vectors with more elements - than the data vector for now. 
*/ - unsigned HOST_WIDE_INT factor - = const_offset_nunits / const_nunits; - vec_offset = vec_offsets[j / factor]; - unsigned elt_offset = (j % factor) * const_nunits; - tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); - tree scale = size_int (gs_info.scale); - align - = get_object_alignment (DR_REF (first_dr_info->dr)); - tree ltype = build_aligned_type (TREE_TYPE (vectype), - align); - for (unsigned k = 0; k < const_nunits; ++k) + if (loop_lens) + final_len + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, + vec_num * ncopies, vectype, + vec_num * j + i, 1); + else + final_len + = build_int_cst (sizetype, + TYPE_VECTOR_SUBPARTS (vectype)); + signed char biasval + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + bias = build_int_cst (intQI_type_node, biasval); + if (!final_mask) { - tree boff = size_binop (MULT_EXPR, - TYPE_SIZE (idx_type), - bitsize_int - (k + elt_offset)); - tree idx = gimple_build (&stmts, BIT_FIELD_REF, - idx_type, vec_offset, - TYPE_SIZE (idx_type), - boff); - idx = gimple_convert (&stmts, sizetype, idx); - idx = gimple_build (&stmts, MULT_EXPR, - sizetype, idx, scale); - tree ptr = gimple_build (&stmts, PLUS_EXPR, - TREE_TYPE (dataref_ptr), - dataref_ptr, idx); - ptr = gimple_convert (&stmts, ptr_type_node, ptr); - tree elt = make_ssa_name (TREE_TYPE (vectype)); - tree ref = build2 (MEM_REF, ltype, ptr, - build_int_cst (ref_type, 0)); - new_stmt = gimple_build_assign (elt, ref); - gimple_set_vuse (new_stmt, - gimple_vuse (gsi_stmt (*gsi))); - gimple_seq_add_stmt (&stmts, new_stmt); - CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt); + mask_vectype = truth_type_for (vectype); + final_mask = build_minus_one_cst (mask_vectype); } - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - new_stmt = gimple_build_assign (NULL_TREE, - build_constructor - (vectype, ctor_elts)); - data_ref = NULL_TREE; - break; } - if (costing_p) - break; - - align = - known_alignment (DR_TARGET_ALIGNMENT (first_dr_info)); - if (alignment_support_scheme == 
dr_aligned) - misalign = 0; - else if (misalignment == DR_MISALIGNMENT_UNKNOWN) - { - align = dr_alignment - (vect_dr_behavior (vinfo, first_dr_info)); - misalign = 0; - } + gcall *call; + if (final_len && final_mask) + call = gimple_build_call_internal ( + IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset, + scale, zero, final_mask, final_len, bias); + else if (final_mask) + call + = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5, + dataref_ptr, vec_offset, + scale, zero, final_mask); else - misalign = misalignment; - if (dataref_offset == NULL_TREE - && TREE_CODE (dataref_ptr) == SSA_NAME) - set_ptr_info_alignment (get_ptr_info (dataref_ptr), - align, misalign); - align = least_bit_hwi (misalign | align); - - /* Compute IFN when LOOP_LENS or final_mask valid. */ - machine_mode vmode = TYPE_MODE (vectype); - machine_mode new_vmode = vmode; - internal_fn partial_ifn = IFN_LAST; - if (loop_lens) + call + = gimple_build_call_internal (IFN_GATHER_LOAD, 4, + dataref_ptr, vec_offset, + scale, zero); + gimple_call_set_nothrow (call, true); + new_stmt = call; + data_ref = NULL_TREE; + break; + } + else if (memory_access_type == VMAT_GATHER_SCATTER) + { + /* Emulated gather-scatter. */ + gcc_assert (!final_mask); + unsigned HOST_WIDE_INT const_nunits = nunits.to_constant (); + if (costing_p) { - opt_machine_mode new_ovmode - = get_len_load_store_mode (vmode, true, - &partial_ifn); - new_vmode = new_ovmode.require (); - unsigned factor = (new_ovmode == vmode) - ? 1 - : GET_MODE_UNIT_SIZE (vmode); - final_len - = vect_get_loop_len (loop_vinfo, gsi, loop_lens, - vec_num * ncopies, vectype, - vec_num * j + i, factor); + /* For emulated gathers N offset vector element + offset add is consumed by the load). */ + inside_cost + = record_stmt_cost (cost_vec, const_nunits, + vec_to_scalar, stmt_info, 0, + vect_body); + /* N scalar loads plus gathering them into a + vector. 
*/ + inside_cost = record_stmt_cost (cost_vec, const_nunits, + scalar_load, stmt_info, + 0, vect_body); + inside_cost + = record_stmt_cost (cost_vec, 1, vec_construct, + stmt_info, 0, vect_body); + break; } - else if (final_mask) + unsigned HOST_WIDE_INT const_offset_nunits + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype) + .to_constant (); + vec<constructor_elt, va_gc> *ctor_elts; + vec_alloc (ctor_elts, const_nunits); + gimple_seq stmts = NULL; + /* We support offset vectors with more elements + than the data vector for now. */ + unsigned HOST_WIDE_INT factor + = const_offset_nunits / const_nunits; + vec_offset = vec_offsets[j / factor]; + unsigned elt_offset = (j % factor) * const_nunits; + tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); + tree scale = size_int (gs_info.scale); + align = get_object_alignment (DR_REF (first_dr_info->dr)); + tree ltype + = build_aligned_type (TREE_TYPE (vectype), align); + for (unsigned k = 0; k < const_nunits; ++k) { - if (!can_vec_mask_load_store_p ( - vmode, TYPE_MODE (TREE_TYPE (final_mask)), true, - &partial_ifn)) - gcc_unreachable (); + tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type), + bitsize_int (k + elt_offset)); + tree idx = gimple_build (&stmts, BIT_FIELD_REF, + idx_type, vec_offset, + TYPE_SIZE (idx_type), boff); + idx = gimple_convert (&stmts, sizetype, idx); + idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx, + scale); + tree ptr = gimple_build (&stmts, PLUS_EXPR, + TREE_TYPE (dataref_ptr), + dataref_ptr, idx); + ptr = gimple_convert (&stmts, ptr_type_node, ptr); + tree elt = make_ssa_name (TREE_TYPE (vectype)); + tree ref = build2 (MEM_REF, ltype, ptr, + build_int_cst (ref_type, 0)); + new_stmt = gimple_build_assign (elt, ref); + gimple_set_vuse (new_stmt, + gimple_vuse (gsi_stmt (*gsi))); + gimple_seq_add_stmt (&stmts, new_stmt); + CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt); } + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + new_stmt = gimple_build_assign ( + NULL_TREE, 
build_constructor (vectype, ctor_elts)); + data_ref = NULL_TREE; + break; + } - if (partial_ifn == IFN_MASK_LEN_LOAD) + if (costing_p) + break; + + align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info)); + if (alignment_support_scheme == dr_aligned) + misalign = 0; + else if (misalignment == DR_MISALIGNMENT_UNKNOWN) + { + align + = dr_alignment (vect_dr_behavior (vinfo, first_dr_info)); + misalign = 0; + } + else + misalign = misalignment; + if (dataref_offset == NULL_TREE + && TREE_CODE (dataref_ptr) == SSA_NAME) + set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, + misalign); + align = least_bit_hwi (misalign | align); + + /* Compute IFN when LOOP_LENS or final_mask valid. */ + machine_mode vmode = TYPE_MODE (vectype); + machine_mode new_vmode = vmode; + internal_fn partial_ifn = IFN_LAST; + if (loop_lens) + { + opt_machine_mode new_ovmode + = get_len_load_store_mode (vmode, true, &partial_ifn); + new_vmode = new_ovmode.require (); + unsigned factor + = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode); + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens, + vec_num * ncopies, vectype, + vec_num * j + i, factor); + } + else if (final_mask) + { + if (!can_vec_mask_load_store_p ( + vmode, TYPE_MODE (TREE_TYPE (final_mask)), true, + &partial_ifn)) + gcc_unreachable (); + } + + if (partial_ifn == IFN_MASK_LEN_LOAD) + { + if (!final_len) { - if (!final_len) - { - /* Pass VF value to 'len' argument of - MASK_LEN_LOAD if LOOP_LENS is invalid. */ - final_len - = size_int (TYPE_VECTOR_SUBPARTS (vectype)); - } - if (!final_mask) - { - /* Pass all ones value to 'mask' argument of - MASK_LEN_LOAD if final_mask is invalid. */ - mask_vectype = truth_type_for (vectype); - final_mask = build_minus_one_cst (mask_vectype); - } + /* Pass VF value to 'len' argument of + MASK_LEN_LOAD if LOOP_LENS is invalid. 
*/ + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype)); } - if (final_len) + if (!final_mask) { - signed char biasval - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); - - bias = build_int_cst (intQI_type_node, biasval); + /* Pass all ones value to 'mask' argument of + MASK_LEN_LOAD if final_mask is invalid. */ + mask_vectype = truth_type_for (vectype); + final_mask = build_minus_one_cst (mask_vectype); } + } + if (final_len) + { + signed char biasval + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); - if (final_len) + bias = build_int_cst (intQI_type_node, biasval); + } + + if (final_len) + { + tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT); + gcall *call; + if (partial_ifn == IFN_MASK_LEN_LOAD) + call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5, + dataref_ptr, ptr, + final_mask, final_len, + bias); + else + call = gimple_build_call_internal (IFN_LEN_LOAD, 4, + dataref_ptr, ptr, + final_len, bias); + gimple_call_set_nothrow (call, true); + new_stmt = call; + data_ref = NULL_TREE; + + /* Need conversion if it's wrapped with VnQI. */ + if (vmode != new_vmode) { - tree ptr - = build_int_cst (ref_type, align * BITS_PER_UNIT); - gcall *call; - if (partial_ifn == IFN_MASK_LEN_LOAD) - call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, - 5, dataref_ptr, - ptr, final_mask, - final_len, bias); - else - call = gimple_build_call_internal (IFN_LEN_LOAD, 4, - dataref_ptr, ptr, - final_len, bias); - gimple_call_set_nothrow (call, true); - new_stmt = call; - data_ref = NULL_TREE; - - /* Need conversion if it's wrapped with VnQI. 
*/ - if (vmode != new_vmode) - { - tree new_vtype = build_vector_type_for_mode ( - unsigned_intQI_type_node, new_vmode); - tree var = vect_get_new_ssa_name (new_vtype, - vect_simple_var); - gimple_set_lhs (call, var); - vect_finish_stmt_generation (vinfo, stmt_info, call, - gsi); - tree op = build1 (VIEW_CONVERT_EXPR, vectype, var); - new_stmt - = gimple_build_assign (vec_dest, - VIEW_CONVERT_EXPR, op); - } + tree new_vtype = build_vector_type_for_mode ( + unsigned_intQI_type_node, new_vmode); + tree var + = vect_get_new_ssa_name (new_vtype, vect_simple_var); + gimple_set_lhs (call, var); + vect_finish_stmt_generation (vinfo, stmt_info, call, + gsi); + tree op = build1 (VIEW_CONVERT_EXPR, vectype, var); + new_stmt = gimple_build_assign (vec_dest, + VIEW_CONVERT_EXPR, op); } - else if (final_mask) + } + else if (final_mask) + { + tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT); + gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3, + dataref_ptr, ptr, + final_mask); + gimple_call_set_nothrow (call, true); + new_stmt = call; + data_ref = NULL_TREE; + } + else + { + tree ltype = vectype; + tree new_vtype = NULL_TREE; + unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info); + unsigned int vect_align + = vect_known_alignment_in_bytes (first_dr_info, vectype); + unsigned int scalar_dr_size + = vect_get_scalar_dr_size (first_dr_info); + /* If there's no peeling for gaps but we have a gap + with slp loads then load the lower half of the + vector only. See get_group_load_store_type for + when we apply this optimization. 
*/ + if (slp + && loop_vinfo + && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0 + && known_eq (nunits, (group_size - gap) * 2) + && known_eq (nunits, group_size) + && gap >= (vect_align / scalar_dr_size)) { - tree ptr = build_int_cst (ref_type, - align * BITS_PER_UNIT); - gcall *call - = gimple_build_call_internal (IFN_MASK_LOAD, 3, - dataref_ptr, ptr, - final_mask); - gimple_call_set_nothrow (call, true); - new_stmt = call; - data_ref = NULL_TREE; + tree half_vtype; + new_vtype + = vector_vector_composition_type (vectype, 2, + &half_vtype); + if (new_vtype != NULL_TREE) + ltype = half_vtype; } + tree offset + = (dataref_offset ? dataref_offset + : build_int_cst (ref_type, 0)); + if (ltype != vectype + && memory_access_type == VMAT_CONTIGUOUS_REVERSE) + { + unsigned HOST_WIDE_INT gap_offset + = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type)); + tree gapcst = build_int_cst (ref_type, gap_offset); + offset = size_binop (PLUS_EXPR, offset, gapcst); + } + data_ref + = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); + if (alignment_support_scheme == dr_aligned) + ; else + TREE_TYPE (data_ref) + = build_aligned_type (TREE_TYPE (data_ref), + align * BITS_PER_UNIT); + if (ltype != vectype) { - tree ltype = vectype; - tree new_vtype = NULL_TREE; - unsigned HOST_WIDE_INT gap - = DR_GROUP_GAP (first_stmt_info); - unsigned int vect_align - = vect_known_alignment_in_bytes (first_dr_info, - vectype); - unsigned int scalar_dr_size - = vect_get_scalar_dr_size (first_dr_info); - /* If there's no peeling for gaps but we have a gap - with slp loads then load the lower half of the - vector only. See get_group_load_store_type for - when we apply this optimization. 
*/ - if (slp - && loop_vinfo - && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) - && gap != 0 - && known_eq (nunits, (group_size - gap) * 2) - && known_eq (nunits, group_size) - && gap >= (vect_align / scalar_dr_size)) + vect_copy_ref_info (data_ref, + DR_REF (first_dr_info->dr)); + tree tem = make_ssa_name (ltype); + new_stmt = gimple_build_assign (tem, data_ref); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, + gsi); + data_ref = NULL; + vec<constructor_elt, va_gc> *v; + vec_alloc (v, 2); + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) { - tree half_vtype; - new_vtype - = vector_vector_composition_type (vectype, 2, - &half_vtype); - if (new_vtype != NULL_TREE) - ltype = half_vtype; + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, + build_zero_cst (ltype)); + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); } - tree offset - = (dataref_offset ? dataref_offset - : build_int_cst (ref_type, 0)); - if (ltype != vectype - && memory_access_type == VMAT_CONTIGUOUS_REVERSE) + else { - unsigned HOST_WIDE_INT gap_offset - = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type)); - tree gapcst = build_int_cst (ref_type, gap_offset); - offset = size_binop (PLUS_EXPR, offset, gapcst); + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, + build_zero_cst (ltype)); } - data_ref - = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); - if (alignment_support_scheme == dr_aligned) - ; + gcc_assert (new_vtype != NULL_TREE); + if (new_vtype == vectype) + new_stmt = gimple_build_assign ( + vec_dest, build_constructor (vectype, v)); else - TREE_TYPE (data_ref) - = build_aligned_type (TREE_TYPE (data_ref), - align * BITS_PER_UNIT); - if (ltype != vectype) { - vect_copy_ref_info (data_ref, - DR_REF (first_dr_info->dr)); - tree tem = make_ssa_name (ltype); - new_stmt = gimple_build_assign (tem, data_ref); + tree new_vname = make_ssa_name (new_vtype); + new_stmt = gimple_build_assign ( + new_vname, build_constructor (new_vtype, v)); vect_finish_stmt_generation 
(vinfo, stmt_info, new_stmt, gsi); - data_ref = NULL; - vec<constructor_elt, va_gc> *v; - vec_alloc (v, 2); - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) - { - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, - build_zero_cst (ltype)); - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); - } - else - { - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, - build_zero_cst (ltype)); - } - gcc_assert (new_vtype != NULL_TREE); - if (new_vtype == vectype) - new_stmt = gimple_build_assign ( - vec_dest, build_constructor (vectype, v)); - else - { - tree new_vname = make_ssa_name (new_vtype); - new_stmt = gimple_build_assign ( - new_vname, build_constructor (new_vtype, v)); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - new_stmt = gimple_build_assign ( - vec_dest, build1 (VIEW_CONVERT_EXPR, vectype, - new_vname)); - } + new_stmt = gimple_build_assign ( + vec_dest, + build1 (VIEW_CONVERT_EXPR, vectype, new_vname)); } } - break; } - case dr_explicit_realign: - { - if (costing_p) - break; - tree ptr, bump; - - tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); + break; + } + case dr_explicit_realign: + { + if (costing_p) + break; + tree ptr, bump; - if (compute_in_loop) - msq = vect_setup_realignment (vinfo, first_stmt_info, gsi, - &realignment_token, - dr_explicit_realign, - dataref_ptr, NULL); + tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); - if (TREE_CODE (dataref_ptr) == SSA_NAME) - ptr = copy_ssa_name (dataref_ptr); - else - ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); - // For explicit realign the target alignment should be - // known at compile time. 
- unsigned HOST_WIDE_INT align = - DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); - new_stmt = gimple_build_assign - (ptr, BIT_AND_EXPR, dataref_ptr, - build_int_cst - (TREE_TYPE (dataref_ptr), - -(HOST_WIDE_INT) align)); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - data_ref - = build2 (MEM_REF, vectype, ptr, - build_int_cst (ref_type, 0)); - vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); - vec_dest = vect_create_destination_var (scalar_dest, - vectype); - new_stmt = gimple_build_assign (vec_dest, data_ref); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - gimple_move_vops (new_stmt, stmt_info->stmt); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - msq = new_temp; - - bump = size_binop (MULT_EXPR, vs, - TYPE_SIZE_UNIT (elem_type)); - bump = size_binop (MINUS_EXPR, bump, size_one_node); - ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, - stmt_info, bump); - new_stmt = gimple_build_assign - (NULL_TREE, BIT_AND_EXPR, ptr, - build_int_cst - (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); - if (TREE_CODE (ptr) == SSA_NAME) - ptr = copy_ssa_name (ptr, new_stmt); - else - ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt); - gimple_assign_set_lhs (new_stmt, ptr); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - data_ref - = build2 (MEM_REF, vectype, ptr, - build_int_cst (ref_type, 0)); - break; - } - case dr_explicit_realign_optimized: - { - if (costing_p) - break; - if (TREE_CODE (dataref_ptr) == SSA_NAME) - new_temp = copy_ssa_name (dataref_ptr); - else - new_temp = make_ssa_name (TREE_TYPE (dataref_ptr)); - // We should only be doing this if we know the target - // alignment at compile time. 
- unsigned HOST_WIDE_INT align = - DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); - new_stmt = gimple_build_assign - (new_temp, BIT_AND_EXPR, dataref_ptr, - build_int_cst (TREE_TYPE (dataref_ptr), - -(HOST_WIDE_INT) align)); - vect_finish_stmt_generation (vinfo, stmt_info, - new_stmt, gsi); - data_ref - = build2 (MEM_REF, vectype, new_temp, - build_int_cst (ref_type, 0)); - break; - } - default: - gcc_unreachable (); - } + if (compute_in_loop) + msq = vect_setup_realignment (vinfo, first_stmt_info, gsi, + &realignment_token, + dr_explicit_realign, + dataref_ptr, NULL); + + if (TREE_CODE (dataref_ptr) == SSA_NAME) + ptr = copy_ssa_name (dataref_ptr); + else + ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); + // For explicit realign the target alignment should be + // known at compile time. + unsigned HOST_WIDE_INT align + = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); + new_stmt = gimple_build_assign ( + ptr, BIT_AND_EXPR, dataref_ptr, + build_int_cst (TREE_TYPE (dataref_ptr), + -(HOST_WIDE_INT) align)); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + data_ref + = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0)); + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_stmt = gimple_build_assign (vec_dest, data_ref); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + gimple_move_vops (new_stmt, stmt_info->stmt); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + msq = new_temp; + + bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type)); + bump = size_binop (MINUS_EXPR, bump, size_one_node); + ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info, + bump); + new_stmt = gimple_build_assign ( + NULL_TREE, BIT_AND_EXPR, ptr, + build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); + if (TREE_CODE (ptr) == SSA_NAME) + ptr = copy_ssa_name (ptr, new_stmt); + else + ptr = 
make_ssa_name (TREE_TYPE (ptr), new_stmt); + gimple_assign_set_lhs (new_stmt, ptr); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + data_ref + = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0)); + break; + } + case dr_explicit_realign_optimized: + { + if (costing_p) + break; + if (TREE_CODE (dataref_ptr) == SSA_NAME) + new_temp = copy_ssa_name (dataref_ptr); + else + new_temp = make_ssa_name (TREE_TYPE (dataref_ptr)); + // We should only be doing this if we know the target + // alignment at compile time. + unsigned HOST_WIDE_INT align + = DR_TARGET_ALIGNMENT (first_dr_info).to_constant (); + new_stmt = gimple_build_assign ( + new_temp, BIT_AND_EXPR, dataref_ptr, + build_int_cst (TREE_TYPE (dataref_ptr), + -(HOST_WIDE_INT) align)); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + data_ref = build2 (MEM_REF, vectype, new_temp, + build_int_cst (ref_type, 0)); + break; + } + default: + gcc_unreachable (); + } - /* One common place to cost the above vect load for different - alignment support schemes. */ - if (costing_p) - { - /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we - only need to take care of the first stmt, whose - stmt_info is first_stmt_info, vec_num iterating on it - will cover the cost for the remaining, it's consistent - with transforming. For the prologue cost for realign, - we only need to count it once for the whole group. */ - bool first_stmt_info_p = first_stmt_info == stmt_info; - bool add_realign_cost = first_stmt_info_p && i == 0; - if (memory_access_type == VMAT_CONTIGUOUS - || memory_access_type == VMAT_CONTIGUOUS_REVERSE - || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE - && (!grouped_load || first_stmt_info_p))) - vect_get_load_cost (vinfo, stmt_info, 1, - alignment_support_scheme, misalignment, - add_realign_cost, &inside_cost, - &prologue_cost, cost_vec, cost_vec, - true); - } - else + /* One common place to cost the above vect load for different + alignment support schemes. 
*/ + if (costing_p) + { + /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we + only need to take care of the first stmt, whose + stmt_info is first_stmt_info, vec_num iterating on it + will cover the cost for the remaining, it's consistent + with transforming. For the prologue cost for realign, + we only need to count it once for the whole group. */ + bool first_stmt_info_p = first_stmt_info == stmt_info; + bool add_realign_cost = first_stmt_info_p && i == 0; + if (memory_access_type == VMAT_CONTIGUOUS + || memory_access_type == VMAT_CONTIGUOUS_REVERSE + || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE + && (!grouped_load || first_stmt_info_p))) + vect_get_load_cost (vinfo, stmt_info, 1, + alignment_support_scheme, misalignment, + add_realign_cost, &inside_cost, + &prologue_cost, cost_vec, cost_vec, true); + } + else + { + vec_dest = vect_create_destination_var (scalar_dest, vectype); + /* DATA_REF is null if we've already built the statement. */ + if (data_ref) { - vec_dest = vect_create_destination_var (scalar_dest, vectype); - /* DATA_REF is null if we've already built the statement. */ - if (data_ref) - { - vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); - new_stmt = gimple_build_assign (vec_dest, data_ref); - } - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_set_lhs (new_stmt, new_temp); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); + new_stmt = gimple_build_assign (vec_dest, data_ref); } + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + } - /* 3. Handle explicit realignment if necessary/supported. 
- Create in loop: - vec_dest = realign_load (msq, lsq, realignment_token) */ - if (!costing_p - && (alignment_support_scheme == dr_explicit_realign_optimized - || alignment_support_scheme == dr_explicit_realign)) - { - lsq = gimple_assign_lhs (new_stmt); - if (!realignment_token) - realignment_token = dataref_ptr; - vec_dest = vect_create_destination_var (scalar_dest, vectype); - new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, - msq, lsq, realignment_token); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + /* 3. Handle explicit realignment if necessary/supported. + Create in loop: + vec_dest = realign_load (msq, lsq, realignment_token) */ + if (!costing_p + && (alignment_support_scheme == dr_explicit_realign_optimized + || alignment_support_scheme == dr_explicit_realign)) + { + lsq = gimple_assign_lhs (new_stmt); + if (!realignment_token) + realignment_token = dataref_ptr; + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq, + lsq, realignment_token); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); - if (alignment_support_scheme == dr_explicit_realign_optimized) - { - gcc_assert (phi); - if (i == vec_num - 1 && j == ncopies - 1) - add_phi_arg (phi, lsq, - loop_latch_edge (containing_loop), - UNKNOWN_LOCATION); - msq = lsq; - } + if (alignment_support_scheme == dr_explicit_realign_optimized) + { + gcc_assert (phi); + if (i == vec_num - 1 && j == ncopies - 1) + add_phi_arg (phi, lsq, loop_latch_edge (containing_loop), + UNKNOWN_LOCATION); + msq = lsq; } + } - if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) + { + if (costing_p) + inside_cost = record_stmt_cost (cost_vec, 1, vec_perm, + stmt_info, 0, 
vect_body); + else { - if (costing_p) - inside_cost = record_stmt_cost (cost_vec, 1, vec_perm, - stmt_info, 0, vect_body); - else - { - tree perm_mask = perm_mask_for_reverse (vectype); - new_temp - = permute_vec_elements (vinfo, new_temp, new_temp, - perm_mask, stmt_info, gsi); - new_stmt = SSA_NAME_DEF_STMT (new_temp); - } + tree perm_mask = perm_mask_for_reverse (vectype); + new_temp = permute_vec_elements (vinfo, new_temp, new_temp, + perm_mask, stmt_info, gsi); + new_stmt = SSA_NAME_DEF_STMT (new_temp); } + } - /* Collect vector loads and later create their permutation in - vect_transform_grouped_load (). */ - if (!costing_p && (grouped_load || slp_perm)) - dr_chain.quick_push (new_temp); + /* Collect vector loads and later create their permutation in + vect_transform_grouped_load (). */ + if (!costing_p && (grouped_load || slp_perm)) + dr_chain.quick_push (new_temp); - /* Store vector loads in the corresponding SLP_NODE. */ - if (!costing_p && slp && !slp_perm) - slp_node->push_vec_def (new_stmt); + /* Store vector loads in the corresponding SLP_NODE. */ + if (!costing_p && slp && !slp_perm) + slp_node->push_vec_def (new_stmt); - /* With SLP permutation we load the gaps as well, without - we need to skip the gaps after we manage to fully load - all elements. group_gap_adj is DR_GROUP_SIZE here. */ - group_elt += nunits; - if (!costing_p - && maybe_ne (group_gap_adj, 0U) - && !slp_perm - && known_eq (group_elt, group_size - group_gap_adj)) - { - poly_wide_int bump_val - = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) - * group_gap_adj); - if (tree_int_cst_sgn - (vect_dr_behavior (vinfo, dr_info)->step) == -1) - bump_val = -bump_val; - tree bump = wide_int_to_tree (sizetype, bump_val); - dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, - gsi, stmt_info, bump); - group_elt = 0; - } - } - /* Bump the vector pointer to account for a gap or for excess - elements loaded for a permuted SLP load. 
*/ + /* With SLP permutation we load the gaps as well, without + we need to skip the gaps after we manage to fully load + all elements. group_gap_adj is DR_GROUP_SIZE here. */ + group_elt += nunits; if (!costing_p && maybe_ne (group_gap_adj, 0U) - && slp_perm) + && !slp_perm + && known_eq (group_elt, group_size - group_gap_adj)) { poly_wide_int bump_val - = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) - * group_gap_adj); - if (tree_int_cst_sgn - (vect_dr_behavior (vinfo, dr_info)->step) == -1) + = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj); + if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) + == -1) bump_val = -bump_val; tree bump = wide_int_to_tree (sizetype, bump_val); dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, stmt_info, bump); + group_elt = 0; } } + /* Bump the vector pointer to account for a gap or for excess + elements loaded for a permuted SLP load. */ + if (!costing_p + && maybe_ne (group_gap_adj, 0U) + && slp_perm) + { + poly_wide_int bump_val + = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj); + if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1) + bump_val = -bump_val; + tree bump = wide_int_to_tree (sizetype, bump_val); + dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi, + stmt_info, bump); + } if (slp && !slp_perm) continue; @@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo, } } else - { - if (grouped_load) - { - if (memory_access_type != VMAT_LOAD_STORE_LANES) + { + if (grouped_load) + { + gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE); + /* We assume that the cost of a single load-lanes instruction + is equivalent to the cost of DR_GROUP_SIZE separate loads. + If a grouped access is instead being provided by a + load-and-permute operation, include the cost of the + permutes. 
*/ + if (costing_p && first_stmt_info == stmt_info) { - gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE); - /* We assume that the cost of a single load-lanes instruction - is equivalent to the cost of DR_GROUP_SIZE separate loads. - If a grouped access is instead being provided by a - load-and-permute operation, include the cost of the - permutes. */ - if (costing_p && first_stmt_info == stmt_info) - { - /* Uses an even and odd extract operations or shuffle - operations for each needed permute. */ - int group_size = DR_GROUP_SIZE (first_stmt_info); - int nstmts = ceil_log2 (group_size) * group_size; - inside_cost - += record_stmt_cost (cost_vec, nstmts, vec_perm, - stmt_info, 0, vect_body); + /* Uses an even and odd extract operations or shuffle + operations for each needed permute. */ + int group_size = DR_GROUP_SIZE (first_stmt_info); + int nstmts = ceil_log2 (group_size) * group_size; + inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm, + stmt_info, 0, vect_body); - if (dump_enabled_p ()) - dump_printf_loc ( - MSG_NOTE, vect_location, - "vect_model_load_cost: strided group_size = %d .\n", - group_size); - } - else if (!costing_p) - vect_transform_grouped_load (vinfo, stmt_info, dr_chain, - group_size, gsi); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_model_load_cost:" + "strided group_size = %d .\n", + group_size); + } + else if (!costing_p) + { + vect_transform_grouped_load (vinfo, stmt_info, dr_chain, + group_size, gsi); + *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; } - if (!costing_p) - *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0]; } else if (!costing_p) STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); @@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo, { gcc_assert (memory_access_type != VMAT_INVARIANT && memory_access_type != VMAT_ELEMENTWISE - && memory_access_type != VMAT_STRIDED_SLP); + && memory_access_type != VMAT_STRIDED_SLP + && memory_access_type != VMAT_LOAD_STORE_LANES); if 
(dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "vect_model_load_cost: inside_cost = %u, " -- 2.31.1