Robin Dapp <rd...@linux.ibm.com> writes: > Hi Kewen, Richard, > > thanks for the comments, I addressed them in the attached v4.
Sorry again for the slow review. I only have some very minor comments left: > Bootstrap and regtest are good as before. > > Regards > Robin > > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index bf033e31c1c..dc2756f83e9 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -5637,7 +5637,8 @@ > (define_expand "len_load_v16qi" > [(match_operand:V16QI 0 "vlogical_operand") > (match_operand:V16QI 1 "memory_operand") > - (match_operand:QI 2 "gpc_reg_operand")] > + (match_operand:QI 2 "gpc_reg_operand") > + (match_operand:QI 3 "zero_constant")] > "TARGET_P9_VECTOR && TARGET_64BIT" > { > rtx mem = XEXP (operands[1], 0); > @@ -5651,6 +5652,7 @@ > [(match_operand:V16QI 0 "memory_operand") > (match_operand:V16QI 1 "vlogical_operand") > (match_operand:QI 2 "gpc_reg_operand") > + (match_operand:QI 3 "zero_constant") > ] > "TARGET_P9_VECTOR && TARGET_64BIT" > { > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi > index 2b41cb7fb7b..8df61f578bd 100644 > --- a/gcc/doc/md.texi > +++ b/gcc/doc/md.texi > @@ -5213,7 +5213,10 @@ which must be a vector mode. Operand 2 has whichever > integer mode the > target prefers. If operand 2 exceeds the number of elements in mode > @var{m}, the behavior is undefined. If the target prefers the length > to be measured in bytes rather than elements, it should only implement > -this pattern for vectors of @code{QI} elements. > +this pattern for vectors of @code{QI} elements. Operand 3 specifies > +a bias predicate that determines whether a length of zero is permitted > +or not. If permitted, the predicate should only allow a zero immediate, > +otherwise it should only allow an immediate value of -1. I think it would be better to fold this into the existing documentation a bit more: -------------------------------------------------------------------------- Load (operand 2 - operand 3) elements from vector memory operand 1 into vector register operand 0, setting the other elements of operand 0 to undefined values. Operands 0 and 1 have mode @var{m}, which must be a vector mode. Operand 2 has whichever integer mode the target prefers. Operand 3 conceptually has mode @code{QI}. Operand 2 can be a variable or a constant amount. Operand 3 specifies a constant bias: it is either a constant 0 or a constant -1. The predicate on operand 3 must only accept the bias values that the target actually supports. GCC handles a bias of 0 more efficiently than a bias of -1. If (operand 2 - operand 3) exceeds the number of elements in mode @var{m}, the behavior is undefined. If the target prefers the length to be measured in bytes rather than elements, it should only implement this pattern for vectors of @code{QI} elements. -------------------------------------------------------------------------- > @@ -5226,7 +5229,10 @@ a vector mode. Operand 2 has whichever integer mode > the target prefers. > If operand 2 exceeds the number of elements in mode @var{m}, the behavior > is undefined. If the target prefers the length to be measured in bytes > rather than elements, it should only implement this pattern for vectors > -of @code{QI} elements. > +of @code{QI} elements. Operand 3 specifies a bias predicate that > +determines whether a length of zero is permitted or not. If permitted, > +the predicate should only allow a zero constant, otherwise it should > +only allow an immediate value of -1. > > This pattern is not allowed to @code{FAIL}. Similarly here I think we should say: -------------------------------------------------------------------------- Store (operand 2 - operand 3) vector elements from vector register operand 1 into memory operand 0, leaving the other elements of operand 0 unchanged. Operands 0 and 1 have mode @var{m}, which must be a vector mode. Operand 2 has whichever integer mode the target prefers. Operand 3 conceptually has mode @code{QI}. Operand 2 can be a variable or a constant amount. Operand 3 specifies a constant bias: it is either a constant 0 or a constant -1. The predicate on operand 3 must only accept the bias values that the target actually supports. GCC handles a bias of 0 more efficiently than a bias of -1. If (operand 2 - operand 3) exceeds the number of elements in mode @var{m}, the behavior is undefined. If the target prefers the length to be measured in bytes rather than elements, it should only implement this pattern for vectors of @code{QI} elements. -------------------------------------------------------------------------- > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c > index 8312d08aab2..d45f080c06f 100644 > --- a/gcc/internal-fn.c > +++ b/gcc/internal-fn.c > @@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index) > static void > expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > { > - class expand_operand ops[3]; > - tree type, lhs, rhs, maskt; > - rtx mem, target, mask; > + class expand_operand ops[4]; > + tree type, lhs, rhs, maskt, biast; > + rtx mem, target, mask, bias; > insn_code icode; > > maskt = gimple_call_arg (stmt, 2); > @@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall > *stmt, convert_optab optab) > create_output_operand (&ops[0], target, TYPE_MODE (type)); > create_fixed_operand (&ops[1], mem); > if (optab == len_load_optab) > - create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE > (maskt)), > - TYPE_UNSIGNED (TREE_TYPE (maskt))); > + { > + create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE > (maskt)), > + TYPE_UNSIGNED (TREE_TYPE (maskt))); > + biast = gimple_call_arg (stmt, 3); > + bias = expand_normal (biast); > + create_input_operand (&ops[3], bias, QImode); > + expand_insn (icode, 4, ops); > + } > else > + { > create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt))); > - expand_insn (icode, 3, ops); > + expand_insn (icode, 3, ops); > + } > + Formatting nit: the “else” body should be indented like the “then” body. > if (!rtx_equal_p (target, ops[0].value)) > emit_move_insn (target, ops[0].value); > } > @@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, > convert_optab optab) > static void > expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > { > - class expand_operand ops[3]; > - tree type, lhs, rhs, maskt; > - rtx mem, reg, mask; > + class expand_operand ops[4]; > + tree type, lhs, rhs, maskt, biast; > + rtx mem, reg, mask, bias; > insn_code icode; > > maskt = gimple_call_arg (stmt, 2); > @@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall > *stmt, convert_optab optab) > create_fixed_operand (&ops[0], mem); > create_input_operand (&ops[1], reg, TYPE_MODE (type)); > if (optab == len_store_optab) > - create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE > (maskt)), > - TYPE_UNSIGNED (TREE_TYPE (maskt))); > + { > + create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE > (maskt)), > + TYPE_UNSIGNED (TREE_TYPE (maskt))); > + biast = gimple_call_arg (stmt, 4); > + bias = expand_normal (biast); > + create_input_operand (&ops[3], bias, QImode); > + expand_insn (icode, 4, ops); > + } > else > - create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt))); > - expand_insn (icode, 3, ops); > + { > + create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt))); > + expand_insn (icode, 3, ops); > + } > } > > #define expand_mask_store_optab_fn expand_partial_store_optab_fn > @@ -4172,6 +4189,29 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, > tree type, > && insn_operand_matches (icode, 4, GEN_INT (align))); > } > > +/* Return the supported bias for the len_load IFN. For now we only support “for IFN, which is either IFN_LEN_LOAD or IFN_LEN_STORE.” > + the biases of 0 and -1 (in case 0 is not an allowable length for > len_load). …or len_store > + If none of the biases match what the backend provides, return > + VECT_PARTIAL_BIAS_UNSUPPORTED. */ > + > +signed char > +internal_len_load_store_bias (internal_fn ifn, machine_mode mode) > +{ > + optab optab = direct_internal_fn_optab (ifn); > + insn_code icode = direct_optab_handler (optab, mode); > + > + if (icode != CODE_FOR_nothing) > + { > + /* For now we only support biases of 0 or -1. Try both of them. */ > + if (insn_operand_matches (icode, 3, GEN_INT (0))) > + return 0; > + if (insn_operand_matches (icode, 3, GEN_INT (-1))) > + return -1; > + } > + > + return VECT_PARTIAL_BIAS_UNSUPPORTED; > +} > + > /* Expand STMT as though it were a call to internal function FN. */ > > void > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h > index 19d0f849a5a..d46aa4ebf33 100644 > --- a/gcc/internal-fn.h > +++ b/gcc/internal-fn.h > @@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p > (internal_fn, tree, > tree, tree, int); > extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree, > poly_uint64, unsigned int); > +#define VECT_PARTIAL_BIAS_UNSUPPORTED 127 > + > +extern signed char internal_len_load_store_bias (internal_fn ifn, > + machine_mode); > > extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree, > bool, bool, bool, bool, tree *); > diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c > index a94490373c3..c4a6492b50d 100644 > --- a/gcc/tree-ssa-math-opts.c > +++ b/gcc/tree-ssa-math-opts.c > @@ -2723,16 +2723,7 @@ convert_mult_to_widen (gimple *stmt, > gimple_stmt_iterator *gsi) > from_unsigned1 = from_unsigned2 = false; > } > else > - { > - /* Expand can synthesize smul_widen_optab if the target > - supports umul_widen_optab. */ > - op = umul_widen_optab; > - handler = find_widening_optab_handler_and_mode (op, to_mode, > - from_mode, > - &actual_mode); > - if (handler == CODE_FOR_nothing) > - return false; > - } > + return false; This part looks unrelated. > } > > /* Ensure that the inputs to the handler are in the correct precison > diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c > index 4988c93fdb6..4fd078d535a 100644 > --- a/gcc/tree-vect-loop-manip.c > +++ b/gcc/tree-vect-loop-manip.c > @@ -421,6 +421,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, > rgroup_controls *dest_rgm, > static tree > vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, > gimple_seq *preheader_seq, > + gimple_seq *header_seq, > gimple_stmt_iterator loop_cond_gsi, > rgroup_controls *rgc, tree niters, > tree niters_skip, bool might_wrap_p) > @@ -664,6 +665,19 @@ vect_set_loop_controls_directly (class loop *loop, > loop_vec_info loop_vinfo, > > vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl); > } > + > + int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + if (partial_load_bias != 0) > + { > + tree adjusted_len = rgc->bias_adjusted_ctrl; > + gassign *minus = gimple_build_assign (adjusted_len, MINUS_EXPR, > + rgc->controls[0], > + build_int_cst > + (TREE_TYPE (rgc->controls[0]), > + -partial_load_bias)); It would be more canonical to use PLUS_EXPR and not negate the bias. MINUS_EXPR of a constant gets converted to PLUS_EXPR of the negative. > + gimple_seq_add_stmt (header_seq, minus); > + } > + > return next_ctrl; > } > > @@ -744,6 +758,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop, > /* Set up all controls for this group. */ > test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, > &preheader_seq, > + &header_seq, > loop_cond_gsi, rgc, > niters, niters_skip, > might_wrap_p); > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > index e94356d76e9..8eb1726b4f5 100644 > --- a/gcc/tree-vect-loop.c > +++ b/gcc/tree-vect-loop.c > @@ -1163,6 +1163,31 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo) > if (LOOP_VINFO_LENS (loop_vinfo).is_empty ()) > return false; > > + machine_mode len_load_mode = get_len_load_store_mode > + (loop_vinfo->vector_mode, true).require (); > + machine_mode len_store_mode = get_len_load_store_mode > + (loop_vinfo->vector_mode, false).require (); > + > + signed char partial_load_bias = internal_len_load_store_bias > + (IFN_LEN_LOAD, len_load_mode); > + > + signed char partial_store_bias = internal_len_load_store_bias > + (IFN_LEN_STORE, len_store_mode); > + > + gcc_assert (partial_load_bias == partial_store_bias); > + > + if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED) > + return false; > + > + /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit > + len_loads with a length of zero. In order to avoid that we prohibit > + more than one loop length here. */ > + if (partial_load_bias == -1 > + && LOOP_VINFO_LENS (loop_vinfo).length () > 1) > + return false; Nit: return is indented two spaces too far. > + > + LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias; > + > unsigned int max_nitems_per_iter = 1; > unsigned int i; > rgroup_controls *rgl; > @@ -4125,6 +4150,8 @@ vect_estimate_min_profitable_iters (loop_vec_info > loop_vinfo, > here. */ > > bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo); > + signed char partial_load_store_bias > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > bool need_iterate_p > = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) > && !vect_known_niters_smaller_than_vf (loop_vinfo)); > @@ -4157,6 +4184,11 @@ vect_estimate_min_profitable_iters (loop_vec_info > loop_vinfo, > for each since start index is zero. */ > prologue_stmts += num_vectors; > > + /* If we have a non-zero partial load bias, we need one MINUS Becomes PLUS after the change above. > + to adjust the load length. */ > + if (partial_load_store_bias != 0) > + body_stmts += 1; > + > /* Each may need two MINs and one MINUS to update lengths in body > for next iteration. */ > if (need_iterate_p) > @@ -9226,6 +9258,11 @@ vect_get_loop_len (loop_vec_info loop_vinfo, > vec_loop_lens *lens, > { > rgroup_controls *rgl = &(*lens)[nvectors - 1]; > > + signed char partial_load_store_bias = > + LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + > + bool use_bias_adjusted_len = partial_load_store_bias != 0; > + > /* Populate the rgroup's len array, if this is the first time we've > used it. */ > if (rgl->controls.is_empty ()) > @@ -9235,6 +9272,15 @@ vect_get_loop_len (loop_vec_info loop_vinfo, > vec_loop_lens *lens, > { > tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); > gcc_assert (len_type != NULL_TREE); > + > + if (i == 0 && use_bias_adjusted_len) > + { > + tree adjusted_len = > + make_temp_ssa_name (len_type, NULL, "adjusted_loop_len"); > + SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop (); > + rgl->bias_adjusted_ctrl = adjusted_len; > + } > + Think it would be better to make it: if (use_bias_adjusted_len) { gcc_assert (i == 0); But do we need to do this? Code should only care about the final value, so I didn't think we would need to keep the intermediate unbiased length alongside the biased one. (Or maybe we do. My memory is a bit rusty, sorry.) > tree len = make_temp_ssa_name (len_type, NULL, "loop_len"); > > /* Provide a dummy definition until the real one is available. */ > @@ -9243,7 +9289,10 @@ vect_get_loop_len (loop_vec_info loop_vinfo, > vec_loop_lens *lens, > } > } > > - return rgl->controls[index]; > + if (use_bias_adjusted_len) > + return rgl->bias_adjusted_ctrl; > + else > + return rgl->controls[index]; > } > > /* Scale profiling counters by estimation for LOOP which is vectorized > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c > index 17849b575b7..006f4c31217 100644 > --- a/gcc/tree-vect-stmts.c > +++ b/gcc/tree-vect-stmts.c > @@ -8289,9 +8289,16 @@ vectorizable_store (vec_info *vinfo, > gsi); > vec_oprnd = var; > } > + > + /* Check which bias value to use. */ > + signed char biasval = internal_len_load_store_bias > + (IFN_LEN_STORE, new_vmode); I think we talked earlier about asserting that the biases agree with the one chosen by vect_verify_loop_lens. This is probably going back on an earlier decision, sorry, but I think we should either do that or (simpler) just use LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo). We'll then ICE later if the target doesn't accept the bias. > + > + tree bias = build_int_cst (intQI_type_node, biasval); > gcall *call > - = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr, > - ptr, final_len, vec_oprnd); > + = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr, > + ptr, final_len, vec_oprnd, > + bias); > gimple_call_set_nothrow (call, true); > vect_finish_stmt_generation (vinfo, stmt_info, call, gsi); > new_stmt = call; > @@ -9588,22 +9595,30 @@ vectorizable_load (vec_info *vinfo, > vec_num * j + i); > tree ptr = build_int_cst (ref_type, > align * BITS_PER_UNIT); > + > + machine_mode vmode = TYPE_MODE (vectype); > + opt_machine_mode new_ovmode > + = get_len_load_store_mode (vmode, true); > + machine_mode new_vmode = new_ovmode.require (); > + tree qi_type = unsigned_intQI_type_node; > + > + /* Check which bias value to use. */ > + signed char biasval = internal_len_load_store_bias > + (IFN_LEN_LOAD, new_vmode); Same here. Thanks, Richard > + > + tree bias = build_int_cst (intQI_type_node, biasval); > + > gcall *call > - = gimple_build_call_internal (IFN_LEN_LOAD, 3, > + = gimple_build_call_internal (IFN_LEN_LOAD, 4, > dataref_ptr, ptr, > - final_len); > + final_len, bias); > gimple_call_set_nothrow (call, true); > new_stmt = call; > data_ref = NULL_TREE; > > /* Need conversion if it's wrapped with VnQI. */ > - machine_mode vmode = TYPE_MODE (vectype); > - opt_machine_mode new_ovmode > - = get_len_load_store_mode (vmode, true); > - machine_mode new_vmode = new_ovmode.require (); > if (vmode != new_vmode) > { > - tree qi_type = unsigned_intQI_type_node; > tree new_vtype > = build_vector_type_for_mode (qi_type, new_vmode); > tree var = vect_get_new_ssa_name (new_vtype, > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index c4c5678e7f1..75fdfa31405 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -544,6 +544,10 @@ struct rgroup_controls { > > /* A vector of nV controls, in iteration order. */ > vec<tree> controls; > + > + /* In case of len_load and len_store with a bias there is only one > + rgroup. This holds the adjusted loop length for the this rgroup. */ > + tree bias_adjusted_ctrl; > }; > > typedef auto_vec<rgroup_controls> vec_loop_masks; > @@ -749,6 +753,11 @@ public: > epilogue of loop. */ > bool epil_using_partial_vectors_p; > > + /* The bias for len_load and len_store. For now, only 0 and -1 are > + supported. -1 must be used when a backend does not support > + len_load/len_store with a length of zero. */ > + signed char partial_load_store_bias; > + > /* When we have grouped data accesses with gaps, we may introduce invalid > memory accesses. We peel the last iteration of the loop to prevent > this. */ > @@ -814,6 +823,7 @@ public: > #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p > #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) > \ > (L)->epil_using_partial_vectors_p > +#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias > #define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor > #define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor > #define LOOP_VINFO_MASKS(L) (L)->masks