On Tue, 4 Nov 2025, Avinash Jayakar wrote:

> Hi,
> 
> This is a follow-up to the previous patch I raised for fixing PR122065. Here I
> handle cases where the vector constant is uniform but may not be a power of 2.
> Bootstrapped and regtested on powerpc64le-linux. Kindly review.
> 
> Thanks and regards,
> Avinash Jayakar
> 
> Use sequences of shifts and adds/subs if the hardware does not have support
> for vector multiplication.  In a previous patch, bare-bones vector lowering
> had been implemented which only worked when the constant value was a power
> of 2.
> 
> In this patch, a few more cases have been added: if the constant is a uniform
> vector but not a power of 2, use choose_mult_variant with the cost of a
> scalar multiplication times the number of vector elements as the maximum cost
> estimate (e.g., a uniform multiply by 5 can become (x << 2) + x).  This is
> similar to the logic used when expanding MULT_EXPR in the expand pass and in
> the vector pattern recognition in tree-vect-patterns.cc.
> 
> 2025-11-04  Avinash Jayakar  <[email protected]>
> 
> gcc/ChangeLog:
>         PR tree-optimization/122065
>         * tree-vect-generic.cc (expand_vector_mult): Optimize multiplication
>         when the constant is uniform but not a power of 2.
> ---
>  gcc/tree-vect-generic.cc | 151 +++++++++++++++++++++++++++++++++++++--
>  1 file changed, 144 insertions(+), 7 deletions(-)
> 
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index b8e6a7168ff..285ae85c70f 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -497,27 +497,164 @@ add_shift (gimple_stmt_iterator *gsi, tree type, tree op0, int *shiftcnts,
>  
>    return NULL_TREE;
>  }
> -/* Try to expand integer vector multiplication by constant of power 2 using
> -   left shifts.  */
> +/* For integer multiplication by an exact power of 2, use left shifts.
> +   If the constant is a uniform vector, check whether it can be synthesized
> +   with shifts and adds/subs.  Use the scalar multiplication cost to compare
> +   with the synthesized vector code, as done in expmed.cc.  */
>  static tree
>  expand_vector_mult (gimple_stmt_iterator *gsi, tree type, tree op0,
>                   tree op1)
>  {
> +  if (!CONSTANT_CLASS_P (op1))
> +    return NULL_TREE;
>    unsigned int nunits = nunits_for_known_piecewise_op (type);
>    int *shifts = XALLOCAVEC (int, nunits);
>  
> -  // if all element are same value and a power of 2, then we can use shifts
> +  bool perfect_pow2_p = true;
> +  bool uniform_const_p = true;
> +  unsigned HOST_WIDE_INT const_a;
>    for (unsigned int i = 0; i < nunits; i++)
>      {
>        tree cst = VECTOR_CST_ELT (op1, i);
> -      if ((TREE_CODE (cst) != INTEGER_CST || integer_zerop (cst))
> -       || !integer_pow2p (cst) || tree_int_cst_sgn (cst) != 1)
> +      if (TREE_CODE (cst) != INTEGER_CST || integer_zerop (cst)
> +       || tree_int_cst_sgn (cst) != 1)

You can use tree_int_cst_sgn (cst) > 0; the function returns 0 for zero.
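
That is, something like (untested):

      if (TREE_CODE (cst) != INTEGER_CST || tree_int_cst_sgn (cst) <= 0)
        return NULL_TREE;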

>       return NULL_TREE;
>  
> +      perfect_pow2_p &= integer_pow2p (cst);
> +
> +      if (i == 0)
> +     const_a = TREE_INT_CST_LOW (cst);
> +      else if (TREE_INT_CST_LOW (cst) != const_a)
> +     uniform_const_p = false;

You should check tree_fits_uhwi_p (cst); if you only use
HOST_WIDE_INTs you can do the zero check on the integer
value then.  tree_fits_uhwi_p also guarantees >= 0.
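
Something along these lines (just a sketch; tree_fits_uhwi_p already
rejects non-INTEGER_CST trees):

      if (!tree_fits_uhwi_p (cst))
        return NULL_TREE;
      unsigned HOST_WIDE_INT val = tree_to_uhwi (cst);
      if (val == 0)
        return NULL_TREE;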

> +
>        shifts[i] = tree_log2 (cst);
>      }
> -  tree cur_op = add_shift (gsi, type, op0, shifts, LSHIFT_EXPR);
> -  return cur_op;
> +  if (perfect_pow2_p)
> +    return add_shift (gsi, type, op0, shifts, LSHIFT_EXPR);
> +  else if (uniform_const_p)
> +    {
> +      tree scalar_type = type;
> +      gcc_assert (TREE_CODE (type) == VECTOR_TYPE);
> +      scalar_type = TREE_TYPE (type);
> +
> +      // handle signed ints
> +      bool cast_to_unsigned_p = !TYPE_OVERFLOW_WRAPS (scalar_type);
> +      tree multtype = cast_to_unsigned_p
> +     ? unsigned_type_for (scalar_type) : scalar_type;
> +      tree vectype = build_vector_type_for_mode (multtype, TYPE_MODE (type));

You can use unsigned_type_for on the vector type directly.
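
I.e. something like (untested):

      tree vectype = cast_to_unsigned_p ? unsigned_type_for (type) : type;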

> +
> +      // estimate the scalar MULT cost by generating dummy rtx insns
> +      optab op = optab_for_tree_code (MULT_EXPR, multtype, optab_default);
> +      if (op == unknown_optab)
> +     return NULL_TREE; // cannot estimate the cost
> +      enum insn_code icode = optab_handler (op, TYPE_MODE (multtype));
> +      if (icode == CODE_FOR_nothing)
> +     return NULL_TREE; // cannot estimate the cost

I think in this case we should continue lowering, because CODE_FOR_nothing
means we'd end up with libcalls.  Not that I expect this to happen
in practice (I don't think we bother with this situation anywhere else
when estimating RTX costs from the GIMPLE side).  So I'd say simply remove
this whole check.

> +      machine_mode mode = TYPE_MODE (multtype);
> +      rtx rop0 = gen_rtx_REG (mode, 0);
> +      rtx rop1 = gen_rtx_REG (mode, 1);
> +      rtx mult_rtx = gen_rtx_MULT (mode, rop0, rop1);
> +      int max_cost = nunits * rtx_cost (mult_rtx, mode, MULT, 0, true);
> +

I think you want to use set_src_cost here.
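
I.e. something like (untested):

      int max_cost = nunits * set_src_cost (mult_rtx, mode, true);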

> +      struct algorithm alg;
> +      mult_variant variant;
> +
> +      bool possible = choose_mult_variant (TYPE_MODE (vectype), const_a, &alg,
> +                                        &variant, max_cost);
> +      if (!possible)
> +     return NULL_TREE;
> +      if (cast_to_unsigned_p)
> +     {
> +       tree tmp_op = gimplify_build1 (gsi, CONVERT_EXPR, vectype, op0);

Use VIEW_CONVERT_EXPR here.
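
I.e. (untested):

      op0 = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, vectype, op0);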

> +       op0 = tmp_op;
> +     }
> +      tree accumulator, tmp_var;
> +      if (alg.op[0] == alg_zero)
> +     accumulator = build_int_cst (vectype, 0);
> +      else
> +     accumulator = op0;
> +
> +      optab plus_op = optab_for_tree_code (PLUS_EXPR, vectype, optab_vector);
> +      bool plus_supported_p = (plus_op != unknown_optab
> +                            && can_implement_p (plus_op,
> +                                                TYPE_MODE (vectype)));
> +      optab minus_op = optab_for_tree_code (MINUS_EXPR, vectype, optab_vector);
> +      bool minus_supported_p = (minus_op != unknown_optab
> +                              && can_implement_p (minus_op,
> +                                                  TYPE_MODE (vectype)));
> +      for (int i = 1; i < alg.ops; i++)
> +     {
> +       for (unsigned int j = 0; j < nunits; j++)
> +         shifts[j] = alg.log[i];
> +
> +       switch (alg.op[i])
> +         {
> +         case alg_shift:
> +           accumulator = add_shift (gsi, vectype, accumulator, shifts,
> +                                    LSHIFT_EXPR);
> +           break;
> +         case alg_add_t_m2:
> +            tmp_var = add_shift (gsi, vectype, op0, shifts, LSHIFT_EXPR);
> +           if (!plus_supported_p)
> +             return NULL_TREE;

Please avoid generating code (like the convert above or the shift here)
when we fail in the end.  Don't you have to check that the shift
was supported, by checking the result from add_shift?

I see you use NEGATE_EXPR below, but I see no check for that.
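
For the shifts you can check the add_shift result against NULL_TREE
before using it; for the NEGATE_EXPR case an up-front check, before
emitting any code, could look like (untested):

      optab neg_op = optab_for_tree_code (NEGATE_EXPR, vectype, optab_vector);
      if (variant == negate_variant
          && (neg_op == unknown_optab
              || !can_implement_p (neg_op, TYPE_MODE (vectype))))
        return NULL_TREE;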

Thanks,
Richard.

> +           accumulator = gimplify_build2 (gsi, PLUS_EXPR, vectype, tmp_var,
> +                                         accumulator);
> +           break;
> +         case alg_sub_t_m2:
> +           tmp_var = add_shift (gsi, vectype, op0, shifts, LSHIFT_EXPR);
> +           if (!minus_supported_p)
> +             return NULL_TREE;
> +           accumulator = gimplify_build2 (gsi, MINUS_EXPR, vectype,
> +                                          accumulator, tmp_var);
> +           break;
> +         case alg_add_t2_m:
> +           tmp_var = add_shift (gsi, vectype, accumulator, shifts,
> +                                LSHIFT_EXPR);
> +           if (!plus_supported_p)
> +             return NULL_TREE;
> +           accumulator = gimplify_build2 (gsi, PLUS_EXPR, vectype, tmp_var,
> +                                          op0);
> +           break;
> +         case alg_sub_t2_m:
> +           tmp_var = add_shift (gsi, vectype, accumulator, shifts,
> +                                LSHIFT_EXPR);
> +           if (!minus_supported_p)
> +             return NULL_TREE;
> +           accumulator = gimplify_build2 (gsi, MINUS_EXPR, vectype,
> +                                          tmp_var, op0);
> +           break;
> +         case alg_add_factor:
> +           tmp_var = add_shift (gsi, vectype, accumulator, shifts,
> +                                LSHIFT_EXPR);
> +           if (!plus_supported_p)
> +             return NULL_TREE;
> +           accumulator = gimplify_build2 (gsi, PLUS_EXPR, vectype,
> +                                          accumulator, tmp_var);
> +           break;
> +         case alg_sub_factor:
> +           tmp_var = add_shift (gsi, vectype, accumulator, shifts,
> +                                LSHIFT_EXPR);
> +           if (!minus_supported_p)
> +             return NULL_TREE;
> +           accumulator = gimplify_build2 (gsi, MINUS_EXPR, vectype,
> +                                          accumulator, tmp_var);
> +           break;
> +         default:
> +           gcc_unreachable ();
> +         }
> +     }
> +      if (variant == negate_variant)
> +     accumulator = gimplify_build1 (gsi, NEGATE_EXPR, vectype, accumulator);
> +      else if (variant == add_variant)
> +     accumulator = gimplify_build2 (gsi, PLUS_EXPR, vectype, accumulator,
> +                                    op0);
> +
> +      if (cast_to_unsigned_p)
> +     accumulator = gimplify_build1 (gsi, CONVERT_EXPR, type, accumulator);
> +      return accumulator;
> +    }
> +  return NULL_TREE;
>  }
>  /* Try to expand integer vector division by constant using
>     widening multiply, shifts and additions.  */
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
