Passed the x86 bootstrap and regression tests.

Pan
-----Original Message-----
From: Juzhe-Zhong <juzhe.zh...@rivai.ai>
Sent: Tuesday, October 31, 2023 6:08 PM
To: gcc-patches@gcc.gnu.org
Cc: richard.sandif...@arm.com; rguent...@suse.de; jeffreya...@gmail.com; Juzhe-Zhong <juzhe.zh...@rivai.ai>
Subject: [PATCH] VECT: Support mask_len_strided_load/mask_len_strided_store in loop vectorize

This patch makes the loop vectorizer generate the strided load/store IFNs
directly when the target enables them.  Note that it also gives targets
that enable strided load/store but lack gather/scatter the ability to
vectorize strided memory accesses.
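As an illustrative sketch (not part of the patch; the function name is
arbitrary), the kind of loop this enables on such a target is a simple
runtime-stride access:

    /* Element i is read from src[i * stride].  */
    void
    f (int *restrict dst, int *restrict src, long stride, int n)
    {
      for (int i = 0; i < n; ++i)
        dst[i] = src[i * stride];
    }

With this patch, a target providing mask_len_strided_load can vectorize the
read as a single .MASK_LEN_STRIDED_LOAD call taking the scalar stride,
instead of a gather that must first materialize the offset vector
{0, stride, 2*stride, ...} (see vect_get_strided_load_store_ops below).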
gcc/ChangeLog:

	* optabs-query.cc (supports_vec_gather_load_p): Support strided
	load/store.
	(supports_vec_scatter_store_p): Ditto.
	* optabs-query.h (supports_vec_gather_load_p): Ditto.
	(supports_vec_scatter_store_p): Ditto.
	* tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
	(vect_check_gather_scatter): Ditto.
	* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
	(vect_truncate_gather_scatter_offset): Ditto.
	(vect_use_strided_gather_scatters_p): Ditto.
	(vect_get_strided_load_store_ops): Ditto.
	(vectorizable_store): Ditto.
	(vectorizable_load): Ditto.
	* tree-vectorizer.h (vect_gather_scatter_fn_p): Ditto.

---
 gcc/optabs-query.cc        | 27 ++++++++++-----
 gcc/optabs-query.h         |  4 +--
 gcc/tree-vect-data-refs.cc | 71 ++++++++++++++++++++++++++++----------
 gcc/tree-vect-stmts.cc     | 46 +++++++++++++++++-------
 gcc/tree-vectorizer.h      |  3 +-
 5 files changed, 109 insertions(+), 42 deletions(-)

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 947ccef218c..ea594baf15d 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -670,14 +670,19 @@ supports_vec_convert_optab_p (optab op, machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_gather_load_p (machine_mode mode)
+supports_vec_gather_load_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_gather_load[mode])
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
-	 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
-	 || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
-	 ? 1 : -1);
+	 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+	 || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
+	 || (strided_p
+	     && convert_optab_handler (mask_len_strided_load_optab, mode,
+				       Pmode)
+		  != CODE_FOR_nothing)
+	 ? 1
+	 : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
 }
@@ -687,14 +692,20 @@ supports_vec_gather_load_p (machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_scatter_store_p (machine_mode mode)
+supports_vec_scatter_store_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_scatter_store[mode])
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
-	 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
-	 || supports_vec_convert_optab_p (mask_len_scatter_store_optab, mode)
-	 ? 1 : -1);
+	 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+	 || supports_vec_convert_optab_p (mask_len_scatter_store_optab,
+					  mode)
+	 || (strided_p
+	     && convert_optab_handler (mask_len_strided_store_optab, mode,
+				       Pmode)
+		  != CODE_FOR_nothing)
+	 ? 1
+	 : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
 }
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 920eb6a1b67..7c22edc5a78 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -191,8 +191,8 @@ bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool can_atomic_load_p (machine_mode);
 bool lshift_cheap_p (bool);
-bool supports_vec_gather_load_p (machine_mode = E_VOIDmode);
-bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode);
+bool supports_vec_gather_load_p (machine_mode = E_VOIDmode, bool = false);
+bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode, bool = false);
 bool can_vec_extract (machine_mode, machine_mode);
 
 /* Version of find_widening_optab_handler_and_mode that operates on
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..d374849b0a7 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3913,9 +3913,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
 
 bool
-vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
-			  tree vectype, tree memory_type, tree offset_type,
-			  int scale, internal_fn *ifn_out,
+vect_gather_scatter_fn_p (vec_info *vinfo, bool strided_p, bool read_p,
+			  bool masked_p, tree vectype, tree memory_type,
+			  tree offset_type, int scale, internal_fn *ifn_out,
 			  tree *offset_vectype_out)
 {
   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
@@ -3926,7 +3926,7 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn, alt_ifn2;
+  internal_fn ifn, alt_ifn, alt_ifn2, alt_ifn3;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
@@ -3935,6 +3935,12 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
	 use MASK_LEN_GATHER_LOAD regardless whether len and mask are valid
	 or not.  */
       alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
+      /* When the target supports MASK_LEN_STRIDED_LOAD, we can relax the
+	 restrictions around the relationship of the vector offset type to
+	 the loaded data by using a gather load with strided access.
+	 E.g. a "gather" of N bytes with a 64-bit stride would in principle
+	 be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_LOAD;
     }
   else
     {
@@ -3944,6 +3950,12 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
	 use MASK_LEN_SCATTER_STORE regardless whether len and mask are valid
	 or not.  */
       alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
+      /* When the target supports MASK_LEN_STRIDED_STORE, we can relax the
+	 restrictions around the relationship of the vector offset type to
+	 the stored data by using a scatter store with strided access.
+	 E.g. a "scatter" of N bytes with a 64-bit stride would in principle
+	 be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_STORE;
     }
 
   for (;;)
@@ -3953,8 +3965,20 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
	return false;
 
       /* Test whether the target supports this combination.  */
-      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
-						  offset_vectype, scale))
+      /* We needn't check whether the target supports the gather/scatter
+	 IFN with the expected vector offset for a strided access when the
+	 target itself supports the strided load/store IFN.  */
+      if (strided_p
+	  && internal_strided_fn_supported_p (alt_ifn3, vectype, offset_type,
+					      scale))
+	{
+	  *ifn_out = alt_ifn3;
+	  *offset_vectype_out = offset_vectype;
+	  return true;
+	}
+      else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+						       memory_type,
+						       offset_vectype, scale))
	{
	  *ifn_out = ifn;
	  *offset_vectype_out = offset_vectype;
@@ -4047,9 +4071,12 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 
   /* True if we should aim to use internal functions rather than
      built-in functions.  */
-  bool use_ifn_p = (DR_IS_READ (dr)
-		    ? supports_vec_gather_load_p (TYPE_MODE (vectype))
-		    : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
+  bool use_ifn_p
+    = (DR_IS_READ (dr)
+       ? supports_vec_gather_load_p (TYPE_MODE (vectype),
+				     STMT_VINFO_STRIDED_P (stmt_info))
+       : supports_vec_scatter_store_p (TYPE_MODE (vectype),
+				       STMT_VINFO_STRIDED_P (stmt_info)));
 
   base = DR_REF (dr);
   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
@@ -4196,13 +4223,17 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
	  /* Only treat this as a scaling operation if the target supports
	     it for at least some offset type.  */
	  if (use_ifn_p
-	      && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-					    masked_p, vectype, memory_type,
+	      && !vect_gather_scatter_fn_p (loop_vinfo,
+					    STMT_VINFO_STRIDED_P (stmt_info),
+					    DR_IS_READ (dr), masked_p,
+					    vectype, memory_type,
					    signed_char_type_node,
					    new_scale, &ifn,
					    &offset_vectype)
-	      && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-					    masked_p, vectype, memory_type,
+	      && !vect_gather_scatter_fn_p (loop_vinfo,
+					    STMT_VINFO_STRIDED_P (stmt_info),
+					    DR_IS_READ (dr), masked_p,
+					    vectype, memory_type,
					    unsigned_char_type_node,
					    new_scale, &ifn,
					    &offset_vectype))
@@ -4225,8 +4256,10 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
       if (use_ifn_p
	  && TREE_CODE (off) == SSA_NAME
	  && !POINTER_TYPE_P (TREE_TYPE (off))
-	  && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-				       masked_p, vectype, memory_type,
+	  && vect_gather_scatter_fn_p (loop_vinfo,
+				       STMT_VINFO_STRIDED_P (stmt_info),
+				       DR_IS_READ (dr), masked_p,
+				       vectype, memory_type,
				       TREE_TYPE (off), scale, &ifn,
				       &offset_vectype))
	break;
@@ -4280,9 +4313,11 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 
   if (use_ifn_p)
     {
-      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-				     vectype, memory_type, offtype, scale,
-				     &ifn, &offset_vectype))
+      if (!vect_gather_scatter_fn_p (loop_vinfo,
+				     STMT_VINFO_STRIDED_P (stmt_info),
+				     DR_IS_READ (dr), masked_p, vectype,
+				     memory_type, offtype, scale, &ifn,
+				     &offset_vectype))
	ifn = IFN_LAST;
       decl = NULL_TREE;
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a9200767f67..8ff06bd3acb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1506,10 +1506,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       internal_fn len_ifn = (is_load
			      ? IFN_MASK_LEN_GATHER_LOAD
			      : IFN_MASK_LEN_SCATTER_STORE);
-      if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
-						  gs_info->memory_type,
-						  gs_info->offset_vectype,
-						  gs_info->scale))
+      if (internal_strided_fn_p (gs_info->ifn)
+	  && internal_strided_fn_supported_p (gs_info->ifn, vectype,
+					      TREE_TYPE (gs_info->offset),
+					      gs_info->scale))
+	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      else if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
+						       gs_info->memory_type,
+						       gs_info->offset_vectype,
+						       gs_info->scale))
	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
       else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
							gs_info->memory_type,
@@ -1693,8 +1698,10 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
       /* See whether the target supports the operation with an offset
	 no narrower than OFFSET_TYPE.  */
       tree memory_type = TREE_TYPE (DR_REF (dr));
-      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-				     vectype, memory_type, offset_type, scale,
+      if (!vect_gather_scatter_fn_p (loop_vinfo,
+				     STMT_VINFO_STRIDED_P (stmt_info),
+				     DR_IS_READ (dr), masked_p, vectype,
+				     memory_type, offset_type, scale,
				     &gs_info->ifn, &gs_info->offset_vectype)
	  || gs_info->ifn == IFN_LAST)
	continue;
@@ -1734,6 +1741,15 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
       || gs_info->ifn == IFN_LAST)
     return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
						 masked_p, gs_info);
+  else if (internal_strided_fn_p (gs_info->ifn))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "using strided IFN for strided/grouped access,"
+			 " scale = %d\n",
+			 gs_info->scale);
+      return true;
+    }
 
   tree old_offset_type = TREE_TYPE (gs_info->offset);
   tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -3012,9 +3028,13 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
			  ssize_int (gs_info->scale));
   step = fold_convert (offset_type, step);
 
-  /* Create {0, X, X*2, X*3, ...}.  */
-  tree offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
-			     build_zero_cst (offset_type), step);
+  tree offset;
+  if (internal_strided_fn_p (gs_info->ifn))
+    offset = step;
+  else
+    /* Create {0, X, X*2, X*3, ...}.  */
+    offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
+			  build_zero_cst (offset_type), step);
 
   *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
 }
@@ -9125,7 +9145,7 @@ vectorizable_store (vec_info *vinfo,
		    vec_offset = vec_offsets[j];
		  tree scale = size_int (gs_info.scale);
 
-		  if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
+		  if (internal_fn_len_index (gs_info.ifn) >= 0)
		    {
		      if (loop_lens)
			final_len = vect_get_loop_len (loop_vinfo, gsi,
@@ -9145,7 +9165,7 @@ vectorizable_store (vec_info *vinfo,
 
		  gcall *call;
		  if (final_len && final_mask)
-		    call = gimple_build_call_internal (IFN_MASK_LEN_SCATTER_STORE,
+		    call = gimple_build_call_internal (gs_info.ifn,
						       7, dataref_ptr, vec_offset,
						       scale, vec_oprnd, final_mask,
						       final_len, bias);
@@ -10949,7 +10969,7 @@ vectorizable_load (vec_info *vinfo,
		    tree zero = build_zero_cst (vectype);
		    tree scale = size_int (gs_info.scale);
 
-		    if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+		    if (internal_fn_len_index (gs_info.ifn) >= 0)
		      {
			if (loop_lens)
			  final_len
@@ -10973,7 +10993,7 @@ vectorizable_load (vec_info *vinfo,
		    gcall *call;
		    if (final_len && final_mask)
		      call
-			= gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
+			= gimple_build_call_internal (gs_info.ifn, 7,
						      dataref_ptr, vec_offset,
						      scale, zero, final_mask,
						      final_len, bias);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a4043e4a656..76bf3aa14b4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2309,7 +2309,8 @@ extern opt_result vect_analyze_data_refs_alignment (loop_vec_info);
 extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
 extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
 extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
-extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
+extern bool vect_gather_scatter_fn_p (vec_info *,
+				      bool, bool, bool, tree, tree,
				      tree, int, internal_fn *, tree *);
 extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info,
				       gather_scatter_info *);
-- 
2.36.3
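For reference, the 7-operand calls built in vectorizable_store and
vectorizable_load above have the shape
.MASK_LEN_STRIDED_LOAD (ptr, stride, scale, else, mask, len, bias), i.e.
the scalar stride sits in the slot a gather uses for its vector offset.
A minimal scalar model of the per-lane addressing this implies (the names
and the exact semantics here are assumptions for illustration, not taken
from the patch):

    #include <stddef.h>

    /* Assumed semantics: for each lane i < len + bias with mask[i] set,
       load one int from byte offset i * stride * scale past BASE;
       inactive lanes take ELSE_VAL.  Illustration only.  */
    static void
    strided_load_model (int *dest, const char *base, ptrdiff_t stride,
                        int scale, const _Bool *mask, int len, int bias,
                        int else_val)
    {
      for (int i = 0; i < len + bias; ++i)
        dest[i] = mask[i]
                  ? *(const int *) (base + (ptrdiff_t) i * stride * scale)
                  : else_val;
    }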