Robin Dapp <rd...@linux.ibm.com> writes:
> Hi Kewen, Richard,
>
> thanks for the comments, I addressed them in the attached v4.

Sorry again for the slow review.  I only have some very minor
comments left:

> Bootstrap and regtest are good as before.
>
> Regards
>  Robin
>
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index bf033e31c1c..dc2756f83e9 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -5637,7 +5637,8 @@
>  (define_expand "len_load_v16qi"
>    [(match_operand:V16QI 0 "vlogical_operand")
>     (match_operand:V16QI 1 "memory_operand")
> -   (match_operand:QI 2 "gpc_reg_operand")]
> +   (match_operand:QI 2 "gpc_reg_operand")
> +   (match_operand:QI 3 "zero_constant")]
>    "TARGET_P9_VECTOR && TARGET_64BIT"
>  {
>    rtx mem = XEXP (operands[1], 0);
> @@ -5651,6 +5652,7 @@
>    [(match_operand:V16QI 0 "memory_operand")
>     (match_operand:V16QI 1 "vlogical_operand")
>     (match_operand:QI 2 "gpc_reg_operand")
> +   (match_operand:QI 3 "zero_constant")
>    ]
>    "TARGET_P9_VECTOR && TARGET_64BIT"
>  {
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 2b41cb7fb7b..8df61f578bd 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5213,7 +5213,10 @@ which must be a vector mode.  Operand 2 has whichever 
> integer mode the
>  target prefers.  If operand 2 exceeds the number of elements in mode
>  @var{m}, the behavior is undefined.  If the target prefers the length
>  to be measured in bytes rather than elements, it should only implement
> -this pattern for vectors of @code{QI} elements.
> +this pattern for vectors of @code{QI} elements.  Operand 3 specifies
> +a bias predicate that determines whether a length of zero is permitted
> +or not.  If permitted, the predicate should only allow a zero immediate,
> +otherwise it should only allow an immediate value of -1.

I think it would be better to fold this into the existing documentation
a bit more:

--------------------------------------------------------------------------
Load (operand 2 - operand 3) elements from vector memory operand 1 into
vector register operand 0, setting the other elements of operand 0 to
undefined values.  Operands 0 and 1 have mode @var{m}, which must be a
vector mode.  Operand 2 has whichever integer mode the target prefers.
Operand 3 conceptually has mode @code{QI}.

Operand 2 can be a variable or a constant amount.  Operand 3 specifies
a constant bias: it is either a constant 0 or a constant -1.  The predicate
on operand 3 must only accept the bias values that the target actually
supports.  GCC handles a bias of 0 more efficiently than a bias of -1.

If (operand 2 - operand 3) exceeds the number of elements in mode @var{m},
the behavior is undefined.

If the target prefers the length to be measured in bytes rather than
elements, it should only implement this pattern for vectors of @code{QI}
elements.
--------------------------------------------------------------------------

> @@ -5226,7 +5229,10 @@ a vector mode.  Operand 2 has whichever integer mode 
> the target prefers.
>  If operand 2 exceeds the number of elements in mode @var{m}, the behavior
>  is undefined.  If the target prefers the length to be measured in bytes
>  rather than elements, it should only implement this pattern for vectors
> -of @code{QI} elements.
> +of @code{QI} elements.  Operand 3 specifies a bias predicate that
> +determines whether a length of zero is permitted or not.  If permitted,
> +the predicate should only allow a zero constant, otherwise it should
> +only allow an immediate value of -1.
>  
>  This pattern is not allowed to @code{FAIL}.

Similarly here I think we should say:

--------------------------------------------------------------------------
Store (operand 2 - operand 3) vector elements from vector register
operand 1 into memory operand 0, leaving the other elements of operand 0
unchanged.  Operands 0 and 1 have mode @var{m}, which must be a vector mode.
Operand 2 has whichever integer mode the target prefers.  Operand 3
conceptually has mode @code{QI}.

Operand 2 can be a variable or a constant amount.  Operand 3 specifies
a constant bias: it is either a constant 0 or a constant -1.  The predicate
on operand 3 must only accept the bias values that the target actually
supports.  GCC handles a bias of 0 more efficiently than a bias of -1.

If (operand 2 - operand 3) exceeds the number of elements in mode @var{m},
the behavior is undefined.

If the target prefers the length to be measured in bytes rather than
elements, it should only implement this pattern for vectors of @code{QI}
elements.
--------------------------------------------------------------------------

> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 8312d08aab2..d45f080c06f 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
>  static void
>  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, target, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, target, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall 
> *stmt, convert_optab optab)
>    create_output_operand (&ops[0], target, TYPE_MODE (type));
>    create_fixed_operand (&ops[1], mem);
>    if (optab == len_load_optab)
> -    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE 
> (maskt)),
> -                              TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE 
> (maskt)),
> +                                TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      biast = gimple_call_arg (stmt, 3);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
>    else
> +    {
>      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +    expand_insn (icode, 3, ops);
> +    }
> +

Formatting nit: the “else” body should be indented like the “then” body.

>    if (!rtx_equal_p (target, ops[0].value))
>      emit_move_insn (target, ops[0].value);
>  }
> @@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, 
> convert_optab optab)
>  static void
>  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, reg, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, reg, mask, bias;
>    insn_code icode;
>  
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall 
> *stmt, convert_optab optab)
>    create_fixed_operand (&ops[0], mem);
>    create_input_operand (&ops[1], reg, TYPE_MODE (type));
>    if (optab == len_store_optab)
> -    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE 
> (maskt)),
> -                              TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +    {
> +      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE 
> (maskt)),
> +                                TYPE_UNSIGNED (TREE_TYPE (maskt)));
> +      biast = gimple_call_arg (stmt, 4);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
>    else
> -    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +    {
> +      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> +      expand_insn (icode, 3, ops);
> +    }
>  }
>  
>  #define expand_mask_store_optab_fn expand_partial_store_optab_fn
> @@ -4172,6 +4189,29 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, 
> tree type,
>         && insn_operand_matches (icode, 4, GEN_INT (align)));
>  }
>  
> +/* Return the supported bias for the len_load IFN.  For now we only support

“for IFN, which is either IFN_LEN_LOAD or IFN_LEN_STORE.”

> +   the biases of 0 and -1 (in case 0 is not an allowable length for 
> len_load).

…or len_store

> +   If none of the biases match what the backend provides, return
> +   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
> +
> +signed char
> +internal_len_load_store_bias (internal_fn ifn, machine_mode mode)
> +{
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      /* For now we only support biases of 0 or -1.  Try both of them.  */
> +      if (insn_operand_matches (icode, 3, GEN_INT (0)))
> +     return 0;
> +      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
> +     return -1;
> +    }
> +
> +  return VECT_PARTIAL_BIAS_UNSUPPORTED;
> +}
> +
>  /* Expand STMT as though it were a call to internal function FN.  */
>  
>  void
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 19d0f849a5a..d46aa4ebf33 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p 
> (internal_fn, tree,
>                                                   tree, tree, int);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>                                               poly_uint64, unsigned int);
> +#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
> +
> +extern signed char internal_len_load_store_bias (internal_fn ifn,
> +                                              machine_mode);
>  
>  extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
>                                   bool, bool, bool, bool, tree *);
> diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
> index a94490373c3..c4a6492b50d 100644
> --- a/gcc/tree-ssa-math-opts.c
> +++ b/gcc/tree-ssa-math-opts.c
> @@ -2723,16 +2723,7 @@ convert_mult_to_widen (gimple *stmt, 
> gimple_stmt_iterator *gsi)
>         from_unsigned1 = from_unsigned2 = false;
>       }
>        else
> -     {
> -       /* Expand can synthesize smul_widen_optab if the target
> -          supports umul_widen_optab.  */
> -       op = umul_widen_optab;
> -       handler = find_widening_optab_handler_and_mode (op, to_mode,
> -                                                       from_mode,
> -                                                       &actual_mode);
> -       if (handler == CODE_FOR_nothing)
> -         return false;
> -     }
> +     return false;

This part looks unrelated.

>      }
>  
>    /* Ensure that the inputs to the handler are in the correct precision
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 4988c93fdb6..4fd078d535a 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -421,6 +421,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, 
> rgroup_controls *dest_rgm,
>  static tree
>  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>                                gimple_seq *preheader_seq,
> +                              gimple_seq *header_seq,
>                                gimple_stmt_iterator loop_cond_gsi,
>                                rgroup_controls *rgc, tree niters,
>                                tree niters_skip, bool might_wrap_p)
> @@ -664,6 +665,19 @@ vect_set_loop_controls_directly (class loop *loop, 
> loop_vec_info loop_vinfo,
>  
>        vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
>      }
> +
> +  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +  if (partial_load_bias != 0)
> +    {
> +      tree adjusted_len = rgc->bias_adjusted_ctrl;
> +      gassign *minus = gimple_build_assign (adjusted_len, MINUS_EXPR,
> +                                         rgc->controls[0],
> +                                         build_int_cst
> +                                         (TREE_TYPE (rgc->controls[0]),
> +                                          -partial_load_bias));

It would be more canonical to use PLUS_EXPR and not negate the bias.
MINUS_EXPR of a constant gets converted to PLUS_EXPR of the negative.

> +      gimple_seq_add_stmt (header_seq, minus);
> +    }
> +
>    return next_ctrl;
>  }
>  
> @@ -744,6 +758,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>       /* Set up all controls for this group.  */
>       test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>                                                    &preheader_seq,
> +                                                  &header_seq,
>                                                    loop_cond_gsi, rgc,
>                                                    niters, niters_skip,
>                                                    might_wrap_p);
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e94356d76e9..8eb1726b4f5 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1163,6 +1163,31 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> +  machine_mode len_load_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, true).require ();
> +  machine_mode len_store_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, false).require ();
> +
> +  signed char partial_load_bias = internal_len_load_store_bias
> +    (IFN_LEN_LOAD, len_load_mode);
> +
> +  signed char partial_store_bias = internal_len_load_store_bias
> +    (IFN_LEN_STORE, len_store_mode);
> +
> +  gcc_assert (partial_load_bias == partial_store_bias);
> +
> +  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
> +    return false;
> +
> +  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +     len_loads with a length of zero.  In order to avoid that we prohibit
> +     more than one loop length here.  */
> +  if (partial_load_bias == -1
> +      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> +      return false;

Nit: return is indented two spaces too far.

> +
> +  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
> +
>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
>    rgroup_controls *rgl;
> @@ -4125,6 +4150,8 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>        here.  */
>  
>        bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
> +      signed char partial_load_store_bias
> +     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>        bool need_iterate_p
>       = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>          && !vect_known_niters_smaller_than_vf (loop_vinfo));
> @@ -4157,6 +4184,11 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>              for each since start index is zero.  */
>           prologue_stmts += num_vectors;
>  
> +         /* If we have a non-zero partial load bias, we need one MINUS

Becomes PLUS after the change above.

> +            to adjust the load length.  */
> +         if (partial_load_store_bias != 0)
> +           body_stmts += 1;
> +
>           /* Each may need two MINs and one MINUS to update lengths in body
>              for next iteration.  */
>           if (need_iterate_p)
> @@ -9226,6 +9258,11 @@ vect_get_loop_len (loop_vec_info loop_vinfo, 
> vec_loop_lens *lens,
>  {
>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
>  
> +  signed char partial_load_store_bias =
> +    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +
> +  bool use_bias_adjusted_len = partial_load_store_bias != 0;
> +
>    /* Populate the rgroup's len array, if this is the first time we've
>       used it.  */
>    if (rgl->controls.is_empty ())
> @@ -9235,6 +9272,15 @@ vect_get_loop_len (loop_vec_info loop_vinfo, 
> vec_loop_lens *lens,
>       {
>         tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
>         gcc_assert (len_type != NULL_TREE);
> +
> +       if (i == 0 && use_bias_adjusted_len)
> +         {
> +           tree adjusted_len =
> +             make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
> +           SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
> +           rgl->bias_adjusted_ctrl = adjusted_len;
> +         }
> +

Think it would be better to make it:

          if (use_bias_adjusted_len)
            {
              gcc_assert (i == 0);

But do we need to do this?  Code should only care about the final value,
so I didn't think we would need to keep the intermediate unbiased length
alongside the biased one.  (Or maybe we do.  My memory is a bit rusty,
sorry.)

>         tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
>  
>         /* Provide a dummy definition until the real one is available.  */
> @@ -9243,7 +9289,10 @@ vect_get_loop_len (loop_vec_info loop_vinfo, 
> vec_loop_lens *lens,
>       }
>      }
>  
> -  return rgl->controls[index];
> +  if (use_bias_adjusted_len)
> +    return rgl->bias_adjusted_ctrl;
> +  else
> +    return rgl->controls[index];
>  }
>  
>  /* Scale profiling counters by estimation for LOOP which is vectorized
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 17849b575b7..006f4c31217 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -8289,9 +8289,16 @@ vectorizable_store (vec_info *vinfo,
>                                                  gsi);
>                     vec_oprnd = var;
>                   }
> +
> +               /* Check which bias value to use.  */
> +               signed char biasval = internal_len_load_store_bias
> +                 (IFN_LEN_STORE, new_vmode);

I think we talked earlier about asserting that the biases agree
with the one chosen by vect_verify_loop_lens.

This is probably going back on an earlier decision, sorry, but I think
we should either do that or (simpler) just use
LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo).  We'll then ICE later
if the target doesn't accept the bias.

> +
> +               tree bias = build_int_cst (intQI_type_node, biasval);
>                 gcall *call
> -                 = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
> -                                               ptr, final_len, vec_oprnd);
> +                 = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
> +                                               ptr, final_len, vec_oprnd,
> +                                               bias);
>                 gimple_call_set_nothrow (call, true);
>                 vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>                 new_stmt = call;
> @@ -9588,22 +9595,30 @@ vectorizable_load (vec_info *vinfo,
>                                              vec_num * j + i);
>                       tree ptr = build_int_cst (ref_type,
>                                                 align * BITS_PER_UNIT);
> +
> +                     machine_mode vmode = TYPE_MODE (vectype);
> +                     opt_machine_mode new_ovmode
> +                       = get_len_load_store_mode (vmode, true);
> +                     machine_mode new_vmode = new_ovmode.require ();
> +                     tree qi_type = unsigned_intQI_type_node;
> +
> +                     /* Check which bias value to use.  */
> +                     signed char biasval = internal_len_load_store_bias
> +                       (IFN_LEN_LOAD, new_vmode);

Same here.

Thanks,
Richard

> +
> +                     tree bias = build_int_cst (intQI_type_node, biasval);
> +
>                       gcall *call
> -                       = gimple_build_call_internal (IFN_LEN_LOAD, 3,
> +                       = gimple_build_call_internal (IFN_LEN_LOAD, 4,
>                                                       dataref_ptr, ptr,
> -                                                     final_len);
> +                                                     final_len, bias);
>                       gimple_call_set_nothrow (call, true);
>                       new_stmt = call;
>                       data_ref = NULL_TREE;
>  
>                       /* Need conversion if it's wrapped with VnQI.  */
> -                     machine_mode vmode = TYPE_MODE (vectype);
> -                     opt_machine_mode new_ovmode
> -                       = get_len_load_store_mode (vmode, true);
> -                     machine_mode new_vmode = new_ovmode.require ();
>                       if (vmode != new_vmode)
>                         {
> -                         tree qi_type = unsigned_intQI_type_node;
>                           tree new_vtype
>                             = build_vector_type_for_mode (qi_type, new_vmode);
>                           tree var = vect_get_new_ssa_name (new_vtype,
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index c4c5678e7f1..75fdfa31405 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -544,6 +544,10 @@ struct rgroup_controls {
>  
>    /* A vector of nV controls, in iteration order.  */
>    vec<tree> controls;
> +
> +  /* In case of len_load and len_store with a bias there is only one
>      rgroup.  This holds the adjusted loop length for this rgroup.  */
> +  tree bias_adjusted_ctrl;
>  };
>  
>  typedef auto_vec<rgroup_controls> vec_loop_masks;
> @@ -749,6 +753,11 @@ public:
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
>  
> +  /* The bias for len_load and len_store.  For now, only 0 and -1 are
> +     supported.  -1 must be used when a backend does not support
> +     len_load/len_store with a length of zero.  */
> +  signed char partial_load_store_bias;
> +
>    /* When we have grouped data accesses with gaps, we may introduce invalid
>       memory accesses.  We peel the last iteration of the loop to prevent
>       this.  */
> @@ -814,6 +823,7 @@ public:
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                           
>   \
>    (L)->epil_using_partial_vectors_p
> +#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
>  #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
>  #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
>  #define LOOP_VINFO_MASKS(L)                (L)->masks

Reply via email to