Tamar Christina <tamar.christ...@arm.com> writes:
> Hi,
>
> Here's the respun patch.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>       PR target/108583
>       * target.def (preferred_div_as_shifts_over_mult): New.
>       * doc/tm.texi.in: Document it.
>       * doc/tm.texi: Regenerate.
>       * targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
>       * targhooks.h (default_preferred_div_as_shifts_over_mult): New.
>       * tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.
>
> gcc/testsuite/ChangeLog:
>
>       PR target/108583
>       * gcc.dg/vect/vect-div-bitmask-4.c: New test.
>       * gcc.dg/vect/vect-div-bitmask-5.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 
> 50a8872a6695b18b9bed0d393bacf733833633db..bf7269e323de1a065d4d04376e5a2703cbb0f9fa
>  100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6137,6 +6137,12 @@ instruction pattern.  There is no need for the hook to 
> handle these two
>  implementation approaches itself.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} bool 
> TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
> +Sometimes it is possible to implement a vector division using a sequence
> +of two addition-shift pairs, giving four instructions in total.
> +Return true if taking this approach for @var{type} is likely
> +to be better than using a sequence involving highpart multiplication.
> +The default is false if the target supports a highpart multiplication
> +for @var{type} (@code{can_mult_highpart_p}), otherwise true.
>  @end deftypefn
>  
>  @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION 
> (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 
> 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15
>  100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4173,6 +4173,7 @@ address;  but often a machine-dependent strategy can 
> generate better code.
>  
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
>  
> +@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
>  
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
>  
> diff --git a/gcc/target.def b/gcc/target.def
> index 
> e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..e4474a3ed6bd2f5f5c010bf0d40c2a371370490c
>  100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1868,6 +1868,18 @@ correct for most targets.",
>   poly_uint64, (const_tree type),
>   default_preferred_vector_alignment)
>  
> +/* Returns whether the target has a preference for decomposing divisions 
> using
> +   shifts rather than multiplies.  */
> +DEFHOOK
> +(preferred_div_as_shifts_over_mult,
> + "Sometimes it is possible to implement a vector division using a sequence\n\
> +of two addition-shift pairs, giving four instructions in total.\n\
> +Return true if taking this approach for @var{type} is likely\n\
> +to be better than using a sequence involving highpart multiplication.\n\
> +The default is false if the target supports a highpart multiplication\n\
> +for @var{type} (@code{can_mult_highpart_p}), otherwise true.",
> + bool, (const_tree type),
> + default_preferred_div_as_shifts_over_mult)
> +
>  /* Return true if vector alignment is reachable (by peeling N
>     iterations) for the given scalar type.  */
>  DEFHOOK
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index 
> a6a4809ca91baa5d7fad2244549317a31390f0c2..a207963b9e6eb9300df0043e1b79aa6c941d0f7f
>  100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
>  extern unsigned HOST_WIDE_INT default_shift_truncation_mask
>    (machine_mode);
>  extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
> +extern bool default_preferred_div_as_shifts_over_mult
> +  (const_tree);
>  extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
>  
>  extern tree default_stack_protect_guard (void);
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index 
> 211525720a620d6f533e2da91e03877337a931e7..7f39ff9b7ec2bf66625d48a47bb76e96c05a3233
>  100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1483,6 +1483,15 @@ default_preferred_vector_alignment (const_tree type)
>    return TYPE_ALIGN (type);
>  }
>  
> +/* The default implementation of
> +   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
> +
> +bool
> +default_preferred_div_as_shifts_over_mult (const_tree type)
> +{
> +  return can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));

The return value should be inverted.

> +}
> +
>  /* By default assume vectors of element TYPE require a multiple of the 
> natural
>     alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c 
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +typedef unsigned __attribute__((__vector_size__ (16))) V;
> +
> +static __attribute__((__noinline__)) __attribute__((__noclone__)) V
> +foo (V v, unsigned short i)
> +{
> +  v /= i;
> +  return v;
> +}
> +
> +int
> +main (void)
> +{
> +  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
> +  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
> +    if (v[i] != 0x00010001)
> +      __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" 
> "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c 
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
> @@ -0,0 +1,58 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t 
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] + level) / 0xff;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] + level) / 0xff;
> +}
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target 
> aarch64*-*-* } } } */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 
> 1766ce277d6b88d8aa3be77e7c8abb504a10a735..183f1a623fbde34f505259cf8f4fb4d34e069614
>  100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3914,6 +3914,83 @@ vect_recog_divmod_pattern (vec_info *vinfo,
>        return pattern_stmt;
>      }
>  
> +  if ((cst = uniform_integer_cst_p (oprnd1))
> +        && TYPE_UNSIGNED (itype)
> +        && rhs_code == TRUNC_DIV_EXPR
> +        && vectype
> +        && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))

Needs reindenting.

OK with those changes, thanks.

Richard

> +    {
> +      /* We can use the relationship:
> +
> +        x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)
> +
> +      to optimize cases where N+1 is a power of 2, and where // (N+1)
> +      is therefore a shift right.  When operating in modes that are
> +      multiples of a byte in size, there are two cases:
> +
> +      (1) N(N+3) is not representable, in which case the question
> +          becomes whether the replacement expression overflows.
> +          It is enough to test that x+N+2 does not overflow,
> +          i.e. that x < MAX-(N+1).
> +
> +      (2) N(N+3) is representable, in which case it is the (only)
> +          bound that we need to check.
> +
> +      ??? For now we just handle the case where // (N+1) is a shift
> +      right by half the precision, since some architectures can
> +      optimize the associated addition and shift combinations
> +      into single instructions.  */
> +
> +      auto wcst = wi::to_wide (cst);
> +      int pow = wi::exact_log2 (wcst + 1);
> +      if (pow == prec / 2)
> +     {
> +       gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
> +
> +       gimple_ranger ranger;
> +       int_range_max r;
> +
> +       /* Check that no overflow will occur.  If we don't have range
> +          information we can't perform the optimization.  */
> +
> +       if (ranger.range_of_expr (r, oprnd0, stmt))
> +         {
> +           wide_int max = r.upper_bound ();
> +           wide_int one = wi::shwi (1, prec);
> +           wide_int adder = wi::add (one, wi::lshift (one, pow));
> +           wi::overflow_type ovf;
> +           wi::add (max, adder, UNSIGNED, &ovf);
> +           if (ovf == wi::OVF_NONE)
> +             {
> +               *type_out = vectype;
> +               tree tadder = wide_int_to_tree (itype, adder);
> +               tree rshift = wide_int_to_tree (itype, pow);
> +
> +               tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
> +               gassign *patt1
> +                 = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
> +               append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +               tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
> +               patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
> +                                            rshift);
> +               append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +               tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
> +               patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
> +                                            oprnd0);
> +               append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +               tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
> +               pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
> +                                                   new_lhs3, rshift);
> +
> +               return pattern_stmt;
> +             }
> +         }
> +     }
> +    }
> +
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
>      return NULL;

Reply via email to