On Tue, 4 Jul 2023, [email protected] wrote:
> From: Ju-Zhe Zhong <[email protected]>
>
> Hi, Richard and Richi.
>
> Address comments from Richi.
>
> Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
>
> I have fully tested these 4 formats:
>
> length = vf is a dummy length,
> mask = {-1,-1, ... } is a dummy mask.
>
> 1. no length, no mask
> LEN_MASK_GATHER_LOAD (..., length = vf, mask = {-1,-1,...})
> 2. length exists, no mask
> LEN_MASK_GATHER_LOAD (..., len, mask = {-1,-1,...})
> 3. mask exists, no length
> LEN_MASK_GATHER_LOAD (..., length = vf, mask)
> 4. both mask and length exist
> LEN_MASK_GATHER_LOAD (..., length, mask)
>
> All of these work fine in this patch.
>
> Here is the example:
>
> void
> f (int *restrict a,
> int *restrict b, int n,
> int base, int step,
> int *restrict cond)
> {
> for (int i = 0; i < n; ++i)
> {
> if (cond[i])
> a[i * 4] = b[i];
> }
> }
>
> Gimple IR:
>
> <bb 3> [local count: 105119324]:
> _58 = (unsigned long) n_13(D);
>
> <bb 4> [local count: 630715945]:
> # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
> # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
> # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
> # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
> _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
> ivtmp_44 = _61 * 4;
> vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
> mask__24.10_49 = vect__4.9_47 != { 0, ... };
> vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
> ivtmp_54 = _61 * 16;
> .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1,
> vect__8.13_53, _61, 0, mask__24.10_49);
> vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
> vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
> vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
> ivtmp_60 = ivtmp_59 - _61;
> if (ivtmp_60 != 0)
> goto <bb 4>; [83.33%]
> else
> goto <bb 5>; [16.67%]
>
> Ok for trunk?
I think it matches the spirit of the existing code, thus OK.
Thanks,
Richard.
> gcc/ChangeLog:
>
> * internal-fn.cc (internal_fn_len_index): Apply
> LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
> (internal_fn_mask_index): Ditto.
> * optabs-query.cc (supports_vec_gather_load_p): Ditto.
> (supports_vec_scatter_store_p): Ditto.
> * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
> * tree-vect-patterns.cc (vect_recog_gather_scatter_pattern): Ditto.
> * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
> (vect_get_strided_load_store_ops): Ditto.
> (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
>
> ---
> gcc/internal-fn.cc | 6 +-
> gcc/optabs-query.cc | 2 +
> gcc/tree-vect-data-refs.cc | 18 +++++-
> gcc/tree-vect-patterns.cc | 4 +-
> gcc/tree-vect-stmts.cc | 122 +++++++++++++++++++++++++++++++------
> 5 files changed, 129 insertions(+), 23 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 303df102d81..bec60cdf4d0 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4472,7 +4472,7 @@ internal_fn_len_index (internal_fn fn)
>
> case IFN_LEN_MASK_GATHER_LOAD:
> case IFN_LEN_MASK_SCATTER_STORE:
> - return 4;
> + return 5;
>
> default:
> return -1;
> @@ -4497,11 +4497,9 @@ internal_fn_mask_index (internal_fn fn)
> case IFN_MASK_SCATTER_STORE:
> case IFN_LEN_MASK_LOAD:
> case IFN_LEN_MASK_STORE:
> - return 4;
> -
> case IFN_LEN_MASK_GATHER_LOAD:
> case IFN_LEN_MASK_SCATTER_STORE:
> - return 6;
> + return 4;
>
> default:
> return (conditional_internal_fn_code (fn) != ERROR_MARK
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index 2fdd0d34354..bf1f484e874 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
> this_fn_optabs->supports_vec_gather_load[mode]
> = (supports_vec_convert_optab_p (gather_load_optab, mode)
> || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> + || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
> ? 1 : -1);
>
> return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
> this_fn_optabs->supports_vec_scatter_store[mode]
> = (supports_vec_convert_optab_p (scatter_store_optab, mode)
> || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> + || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
> ? 1 : -1);
>
> return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index ebe93832b1e..ab2af103cb4 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool
> read_p, bool masked_p,
> return false;
>
> /* Work out which function we need. */
> - internal_fn ifn, alt_ifn;
> + internal_fn ifn, alt_ifn, alt_ifn2;
> if (read_p)
> {
> ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
> alt_ifn = IFN_MASK_GATHER_LOAD;
> + /* When target supports LEN_MASK_GATHER_LOAD, we always
> + use LEN_MASK_GATHER_LOAD regardless whether len and
> + mask are valid or not. */
> + alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
> }
> else
> {
> ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
> alt_ifn = IFN_MASK_SCATTER_STORE;
> + /* When target supports LEN_MASK_SCATTER_STORE, we always
> + use LEN_MASK_SCATTER_STORE regardless whether len and
> + mask are valid or not. */
> + alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
> }
>
> for (;;)
> @@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool
> read_p, bool masked_p,
> *offset_vectype_out = offset_vectype;
> return true;
> }
> + else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
> + memory_type,
> + offset_vectype, scale))
> + {
> + *ifn_out = alt_ifn2;
> + *offset_vectype_out = offset_vectype;
> + return true;
> + }
>
> if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
> && TYPE_PRECISION (offset_type) >= element_bits)
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index de20e9d59cb..1bc36b043a0 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -6075,7 +6075,9 @@ vect_recog_gather_scatter_pattern (vec_info *vinfo,
> mask = vect_convert_mask_for_vectype (mask, gs_vectype, stmt_info,
> loop_vinfo);
> else if (gs_info.ifn == IFN_MASK_SCATTER_STORE
> - || gs_info.ifn == IFN_MASK_GATHER_LOAD)
> + || gs_info.ifn == IFN_MASK_GATHER_LOAD
> + || gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE
> + || gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
> mask = build_int_cst (TREE_TYPE (truth_type_for (gs_vectype)), -1);
>
> /* Get the invariant base and non-invariant offset, converting the
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a0c39268bf0..09b51bf15fa 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info
> loop_vinfo, tree vectype,
> gs_info->offset_vectype,
> gs_info->scale))
> {
> + ifn = (is_load
> + ? IFN_LEN_MASK_GATHER_LOAD
> + : IFN_LEN_MASK_SCATTER_STORE);
> + if (internal_gather_scatter_fn_supported_p (ifn, vectype,
> + gs_info->memory_type,
> + gs_info->offset_vectype,
> + gs_info->scale))
> + {
> + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> + return;
> + }
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "can't operate on partial vectors because"
> @@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
> static void
> vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
> loop_vec_info loop_vinfo,
> + gimple_stmt_iterator *gsi,
> gather_scatter_info *gs_info,
> - tree *dataref_bump, tree *vec_offset)
> + tree *dataref_bump, tree *vec_offset,
> + vec_loop_lens *loop_lens)
> {
> struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>
> - tree bump = size_binop (MULT_EXPR,
> - fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> - size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> - *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> + if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> + {
> + /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
> + ivtmp_8 = _31 * 16 (step in bytes);
> + .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
> + vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
> + tree loop_len
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
> + tree tmp
> + = fold_build2 (MULT_EXPR, sizetype,
> + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> + loop_len);
> + tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
> + gassign *assign = gimple_build_assign (bump, tmp);
> + gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> + *dataref_bump = bump;
> + }
> + else
> + {
> + tree bump
> + = size_binop (MULT_EXPR,
> + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> + size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> + *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> + }
>
> /* The offset given in GS_INFO can have pointer type, so use the element
> type of the vector instead. */
> @@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
> else if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> aggr_type = elem_type;
> - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> - &bump, &vec_offset);
> + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> + &bump, &vec_offset, loop_lens);
> }
> else
> {
> @@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
> unsigned HOST_WIDE_INT align;
>
> tree final_mask = NULL_TREE;
> + tree final_len = NULL_TREE;
> + tree bias = NULL_TREE;
> if (loop_masks)
> final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> vec_num * ncopies,
> @@ -8929,8 +8966,36 @@ vectorizable_store (vec_info *vinfo,
> if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> vec_offset = vec_offsets[vec_num * j + i];
> tree scale = size_int (gs_info.scale);
> +
> + if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
> + {
> + if (loop_lens)
> + final_len
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i, 1);
> + else
> + final_len
> + = build_int_cst (sizetype,
> + TYPE_VECTOR_SUBPARTS (vectype));
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + if (!final_mask)
> + {
> + mask_vectype = truth_type_for (vectype);
> + final_mask = build_minus_one_cst (mask_vectype);
> + }
> + }
> +
> gcall *call;
> - if (final_mask)
> + if (final_len && final_mask)
> + call
> + = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
> + 7, dataref_ptr, vec_offset,
> + scale, vec_oprnd,
> final_mask,
> + final_len, bias);
> + else if (final_mask)
> call = gimple_build_call_internal
> (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
> scale, vec_oprnd, final_mask);
> @@ -9047,9 +9112,6 @@ vectorizable_store (vec_info *vinfo,
> machine_mode vmode = TYPE_MODE (vectype);
> machine_mode new_vmode = vmode;
> internal_fn partial_ifn = IFN_LAST;
> - /* Produce 'len' and 'bias' argument. */
> - tree final_len = NULL_TREE;
> - tree bias = NULL_TREE;
> if (loop_lens)
> {
> opt_machine_mode new_ovmode
> @@ -10177,8 +10239,8 @@ vectorizable_load (vec_info *vinfo,
> else if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> aggr_type = elem_type;
> - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> - &bump, &vec_offset);
> + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> + &bump, &vec_offset, loop_lens);
> }
> else
> {
> @@ -10339,6 +10401,8 @@ vectorizable_load (vec_info *vinfo,
> for (i = 0; i < vec_num; i++)
> {
> tree final_mask = NULL_TREE;
> + tree final_len = NULL_TREE;
> + tree bias = NULL_TREE;
> if (loop_masks
> && memory_access_type != VMAT_INVARIANT)
> final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> @@ -10368,8 +10432,35 @@ vectorizable_load (vec_info *vinfo,
> vec_offset = vec_offsets[vec_num * j + i];
> tree zero = build_zero_cst (vectype);
> tree scale = size_int (gs_info.scale);
> +
> + if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
> + {
> + if (loop_lens)
> + final_len
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i, 1);
> + else
> + final_len = build_int_cst (sizetype,
> + TYPE_VECTOR_SUBPARTS (
> + vectype));
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + if (!final_mask)
> + {
> + mask_vectype = truth_type_for (vectype);
> + final_mask = build_minus_one_cst (mask_vectype);
> + }
> + }
> +
> gcall *call;
> - if (final_mask)
> + if (final_len && final_mask)
> + call = gimple_build_call_internal (
> + IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
> + vec_offset, scale, zero, final_mask, final_len,
> + bias);
> + else if (final_mask)
> call = gimple_build_call_internal
> (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> vec_offset, scale, zero, final_mask);
> @@ -10462,9 +10553,6 @@ vectorizable_load (vec_info *vinfo,
> machine_mode vmode = TYPE_MODE (vectype);
> machine_mode new_vmode = vmode;
> internal_fn partial_ifn = IFN_LAST;
> - /* Produce 'len' and 'bias' argument. */
> - tree final_len = NULL_TREE;
> - tree bias = NULL_TREE;
> if (loop_lens)
> {
> opt_machine_mode new_ovmode
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)