Hi, Richard and Richi.

Based on the suggestions from Richard:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625396.html

this patch chooses approach (1) that Richard provided, meaning: RVV
implements the cond_* optabs as expanders, so RVV supports both
IFN_COND_ADD and IFN_COND_LEN_ADD, and no dummy length arguments are
needed at the gimple level.  This approach keeps the code much cleaner
and more reasonable.

Consider the following case:

void foo (float * __restrict a, float * __restrict b, int * __restrict cond, int n)
{
  for (int i = 0; i < n; i++)
    if (cond[i])
      a[i] = b[i] + a[i];
}

Output of RISC-V (32-bit) GCC (trunk):

<source>:5:21: missed: couldn't vectorize loop
<source>:5:21: missed: not vectorized: control flow in loop.

ARM SVE:

...
mask__27.10_51 = vect__4.9_49 != { 0, ... };
...
vec_mask_and_55 = loop_mask_49 & mask__27.10_51;
...
vect__9.17_62 = .COND_ADD (vec_mask_and_55, vect__6.13_56, vect__8.16_60, vect__6.13_56);

For RVV, we want the IR as follows:

...
_68 = .SELECT_VL (ivtmp_66, POLY_INT_CST [4, 4]);
...
mask__27.10_51 = vect__4.9_49 != { 0, ... };
...
vect__9.17_60 = .COND_LEN_ADD (mask__27.10_51, vect__6.13_55, vect__8.16_59, vect__6.13_55, _68, 0);
...

Both the len and the mask of COND_LEN_ADD are real, not dummy.
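To make the semantics concrete, here is a rough scalar model of what
.COND_LEN_ADD (MASK, A, B, ELSE, LEN, BIAS) computes per element.  This
is only my reading of the cond_len_* optab documentation; the function
name and the vf parameter (standing in for the number of elements in the
vector) are illustrative, not part of the patch:

/* Scalar model of .COND_LEN_ADD: element i produces a[i] + b[i] iff it
   is both inside the effective length (len + bias) and enabled by the
   mask; all other elements take the else value.  */
void
cond_len_add_model (int vf, const bool *mask, const float *a,
                    const float *b, const float *else_vals, int len,
                    int bias, float *result)
{
  for (int i = 0; i < vf; i++)
    result[i] = (i < len + bias && mask[i]) ? a[i] + b[i] : else_vals[i];
}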
On the vectorizer side, vectorizable_call now checks whether the target
directly supports the COND_LEN_* form of the call.  If the loop uses
length-based partial vectors, the call is vectorized with COND_LEN_*,
appending the real loop length (produced by SELECT_VL or MIN) and the
target's partial load/store bias as the two extra arguments.  For
reductions whose original call has no mask operand (e.g. FMA ->
COND_LEN_FMA), a dummy all-ones mask is supplied while the length stays
real.  The selection is driven by a new helper, get_len_internal_fn,
which maps an internal function to its length variant; a sketch of the
resulting decision follows.
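As a minimal sketch of that decision (the wrapper name choose_call_ifn
is illustrative only; get_len_internal_fn and
direct_internal_fn_supported_p are the real interfaces used by the
patch):

/* Pick the COND_LEN_* form of IFN when the loop is fully controlled by
   lengths and the target supports that form directly; otherwise keep
   IFN unchanged.  */
static internal_fn
choose_call_ifn (internal_fn ifn, tree vectype_out, bool len_loop_p)
{
  internal_fn cond_len_fn = get_len_internal_fn (ifn);
  if (len_loop_p
      && cond_len_fn != IFN_LAST
      && direct_internal_fn_supported_p (cond_len_fn, vectype_out,
                                         OPTIMIZE_FOR_SPEED))
    /* E.g. IFN_COND_ADD -> IFN_COND_LEN_ADD; the caller then appends
       the LEN and BIAS arguments.  */
    return cond_len_fn;
  return ifn;
}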
This patch has been fully tested on the RISC-V port with support for
both COND_* and COND_LEN_*.  Bootstrap and regression testing on X86
also passed.

OK for trunk?

gcc/ChangeLog:

	* internal-fn.cc (FOR_EACH_LEN_FN_PAIR): New macro.
	(get_len_internal_fn): New function.
	(CASE): Ditto.
	* internal-fn.h (get_len_internal_fn): Ditto.
	* tree-vect-stmts.cc (vectorizable_call): Support CALL
	vectorization with COND_LEN_*.

---
 gcc/internal-fn.cc     | 46 ++++++++++++++++++++++
 gcc/internal-fn.h      |  1 +
 gcc/tree-vect-stmts.cc | 87 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 8e294286388..379220bebc7 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4443,6 +4443,52 @@ get_conditional_internal_fn (internal_fn fn)
     }
 }
 
+/* Invoke T(IFN) for each internal function IFN that also has an
+   IFN_COND_LEN_* or IFN_MASK_LEN_* form.  */
+#define FOR_EACH_LEN_FN_PAIR(T)                                \
+  T (MASK_LOAD, MASK_LEN_LOAD)                                 \
+  T (MASK_STORE, MASK_LEN_STORE)                               \
+  T (MASK_GATHER_LOAD, MASK_LEN_GATHER_LOAD)                   \
+  T (MASK_SCATTER_STORE, MASK_LEN_SCATTER_STORE)               \
+  T (COND_ADD, COND_LEN_ADD)                                   \
+  T (COND_SUB, COND_LEN_SUB)                                   \
+  T (COND_MUL, COND_LEN_MUL)                                   \
+  T (COND_DIV, COND_LEN_DIV)                                   \
+  T (COND_MOD, COND_LEN_MOD)                                   \
+  T (COND_RDIV, COND_LEN_RDIV)                                 \
+  T (COND_FMIN, COND_LEN_FMIN)                                 \
+  T (COND_FMAX, COND_LEN_FMAX)                                 \
+  T (COND_MIN, COND_LEN_MIN)                                   \
+  T (COND_MAX, COND_LEN_MAX)                                   \
+  T (COND_AND, COND_LEN_AND)                                   \
+  T (COND_IOR, COND_LEN_IOR)                                   \
+  T (COND_XOR, COND_LEN_XOR)                                   \
+  T (COND_SHL, COND_LEN_SHL)                                   \
+  T (COND_SHR, COND_LEN_SHR)                                   \
+  T (COND_NEG, COND_LEN_NEG)                                   \
+  T (COND_FMA, COND_LEN_FMA)                                   \
+  T (COND_FMS, COND_LEN_FMS)                                   \
+  T (COND_FNMA, COND_LEN_FNMA)                                 \
+  T (COND_FNMS, COND_LEN_FNMS)
+
+/* If there exists an internal function like IFN that operates on vectors,
+   but with additional length and bias parameters, return the internal_fn
+   for that function, otherwise return IFN_LAST.  */
+internal_fn
+get_len_internal_fn (internal_fn fn)
+{
+  switch (fn)
+    {
+#define CASE(NAME, LEN_NAME)                                   \
+  case IFN_##NAME:                                             \
+    return IFN_##LEN_NAME;
+      FOR_EACH_LEN_FN_PAIR (CASE)
+#undef CASE
+    default:
+      return IFN_LAST;
+    }
+}
+
 /* If IFN implements the conditional form of an unconditional internal
    function, return that unconditional function, otherwise return
    IFN_LAST.  */
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index a5c3f4765ff..410c1b623d6 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
 
 extern internal_fn get_conditional_internal_fn (tree_code);
 extern internal_fn get_conditional_internal_fn (internal_fn);
+extern internal_fn get_len_internal_fn (internal_fn);
 extern internal_fn get_conditional_len_internal_fn (tree_code);
 extern tree_code conditional_internal_fn_code (internal_fn);
 extern internal_fn get_unconditional_internal_fn (internal_fn);
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 6a4e8fce126..ae5b0b09c08 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3540,7 +3540,10 @@ vectorizable_call (vec_info *vinfo,
 
   int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
   internal_fn cond_fn = get_conditional_internal_fn (ifn);
+  internal_fn cond_len_fn = get_len_internal_fn (ifn);
+  int len_opno = internal_fn_len_index (cond_len_fn);
   vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
+  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
   if (!vec_stmt) /* transformation not required.  */
     {
       if (slp_node)
@@ -3586,8 +3589,14 @@ vectorizable_call (vec_info *vinfo,
 	  tree scalar_mask = NULL_TREE;
 	  if (mask_opno >= 0)
 	    scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
-	  vect_record_loop_mask (loop_vinfo, masks, nvectors,
-				 vectype_out, scalar_mask);
+	  if (cond_len_fn != IFN_LAST
+	      && direct_internal_fn_supported_p (cond_len_fn, vectype_out,
+						 OPTIMIZE_FOR_SPEED))
+	    vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_out,
+				  1);
+	  else
+	    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out,
+				   scalar_mask);
 	}
     }
   return true;
@@ -3603,8 +3612,24 @@ vectorizable_call (vec_info *vinfo,
   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
   bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
   unsigned int vect_nargs = nargs;
-  if (masked_loop_p && reduc_idx >= 0)
+  if (len_loop_p)
+    {
+      if (len_opno >= 0)
+	{
+	  ifn = cond_len_fn;
+	  /* COND_* -> COND_LEN_* takes 2 extra arguments: LEN, BIAS.  */
+	  vect_nargs += 2;
+	}
+      else if (reduc_idx >= 0)
+	{
+	  /* FMA -> COND_LEN_FMA takes 4 extra arguments: MASK, ELSE, LEN, BIAS.  */
+	  ifn = get_len_internal_fn (cond_fn);
+	  vect_nargs += 4;
+	}
+    }
+  else if (masked_loop_p && reduc_idx >= 0)
     {
       ifn = cond_fn;
       vect_nargs += 2;
@@ -3629,7 +3654,18 @@ vectorizable_call (vec_info *vinfo,
       FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_oprnd0)
 	{
 	  int varg = 0;
-	  if (masked_loop_p && reduc_idx >= 0)
+	  if (len_loop_p && reduc_idx >= 0)
+	    {
+	      /* Always true for SLP.  */
+	      gcc_assert (ncopies == 1);
+	      /* For COND_LEN_* operations used by reduction of
+		 CALL vectorization, the LEN argument is the real
+		 loop len produced by SELECT_VL or MIN whereas the
+		 MASK argument here is the dummy mask.  */
+	      vargs[varg++]
+		= build_minus_one_cst (truth_type_for (vectype_out));
+	    }
+	  else if (masked_loop_p && reduc_idx >= 0)
 	    {
 	      unsigned int vec_num = vec_oprnds0.length ();
 	      /* Always true for SLP.  */
@@ -3644,7 +3680,7 @@ vectorizable_call (vec_info *vinfo,
 	      vec<tree> vec_oprndsk = vec_defs[k];
 	      vargs[varg++] = vec_oprndsk[i];
 	    }
-	  if (masked_loop_p && reduc_idx >= 0)
+	  if ((masked_loop_p || len_loop_p) && reduc_idx >= 0)
 	    vargs[varg++] = vargs[reduc_idx + 1];
 	  gimple *new_stmt;
 	  if (modifier == NARROW)
@@ -3671,7 +3707,21 @@ vectorizable_call (vec_info *vinfo,
 	    }
 	  else
 	    {
-	      if (mask_opno >= 0 && masked_loop_p)
+	      if (len_opno >= 0 && len_loop_p)
+		{
+		  unsigned int vec_num = vec_oprnds0.length ();
+		  /* Always true for SLP.  */
+		  gcc_assert (ncopies == 1);
+		  tree len
+		    = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num,
+					 vectype_out, i, 1);
+		  signed char biasval
+		    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+		  tree bias = build_int_cst (intQI_type_node, biasval);
+		  vargs[len_opno] = len;
+		  vargs[len_opno + 1] = bias;
+		}
+	      else if (mask_opno >= 0 && masked_loop_p)
 		{
 		  unsigned int vec_num = vec_oprnds0.length ();
 		  /* Always true for SLP.  */
@@ -3701,7 +3751,16 @@ vectorizable_call (vec_info *vinfo,
 	    }
 
 	  int varg = 0;
-	  if (masked_loop_p && reduc_idx >= 0)
+	  if (len_loop_p && reduc_idx >= 0)
+	    {
+	      /* For COND_LEN_* operations used by reduction of
+		 CALL vectorization, the LEN argument is the real
+		 loop len produced by SELECT_VL or MIN whereas the
+		 MASK argument here is the dummy mask.  */
+	      vargs[varg++]
+		= build_minus_one_cst (truth_type_for (vectype_out));
+	    }
+	  else if (masked_loop_p && reduc_idx >= 0)
 	    vargs[varg++] = vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies,
 						vectype_out, j);
 	  for (i = 0; i < nargs; i++)
@@ -3716,10 +3775,20 @@ vectorizable_call (vec_info *vinfo,
 		}
 	      vargs[varg++] = vec_defs[i][j];
 	    }
-	  if (masked_loop_p && reduc_idx >= 0)
+	  if ((masked_loop_p || len_loop_p) && reduc_idx >= 0)
 	    vargs[varg++] = vargs[reduc_idx + 1];
 
-	  if (mask_opno >= 0 && masked_loop_p)
+	  if (len_opno >= 0 && len_loop_p)
+	    {
+	      tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
+					    vectype_out, j, 1);
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      tree bias = build_int_cst (intQI_type_node, biasval);
+	      vargs[len_opno] = len;
+	      vargs[len_opno + 1] = bias;
+	    }
+	  else if (mask_opno >= 0 && masked_loop_p)
 	    {
 	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
 					      ncopies, vectype_out, j);
-- 
2.36.1