Re: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask

Richard Biener via Gcc-patches Mon, 13 Jun 2022 02:39:21 -0700

On Mon, 13 Jun 2022, Richard Biener wrote:

> On Thu, 9 Jun 2022, Tamar Christina wrote:
> 
> > Hi All,
> > 
> > In plenty of image and video processing code it's common to modify pixel 
> > values
> > by a widening operation and then scale them back into range by dividing by 
> > 255.
> > 
> > This patch adds an optab to allow us to emit an optimized sequence when 
> > doing
> > an unsigned division that is equivalent to:
> > 
> >    x = y / (2 ^ (bitsize (y)/2)-1
> > 
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> > 
> > Ok for master?
> 
> Looking at 2/2 it seems that this is the wrong way to attack the
> problem.  The ISA doesn't have such instruction so adding an optab
> looks premature.  I suppose that there's no unsigned vector integer
> division and thus we open-code that in a different way?  Isn't the
> correct thing then to fixup that open-coding if it is more efficient?


Btw, on x86 we use

t.c:3:21: note:   replacing earlier pattern patt_25 = patt_28 / 255;
t.c:3:21: note:   with patt_25 = patt_19 >> 7;
t.c:3:21: note:   extra pattern stmt: patt_19 = patt_28 h* 32897;

which translates to

        vpmulhuw        %ymm4, %ymm0, %ymm0
        vpmulhuw        %ymm4, %ymm1, %ymm1
        vpsrlw  $7, %ymm0, %ymm0
        vpsrlw  $7, %ymm1, %ymm1

there's odd

        vpand   %ymm0, %ymm3, %ymm0
        vpand   %ymm1, %ymm3, %ymm1

before (%ymm3 is all 0x00ff)

        vpackuswb       %ymm1, %ymm0, %ymm0

that's not visible in GIMPLE.  I guess aarch64 lacks a highpart
multiply here?  In any case, it seems that generic division expansion
could be improved here? (choose_multiplier?)

Richard.

> Richard.
> 
> > Thanks,
> > Tamar
> > 
> > gcc/ChangeLog:
> > 
> >     * internal-fn.def (DIV_POW2_BITMASK): New.
> >     * optabs.def (udiv_pow2_bitmask_optab): New.
> >     * doc/md.texi: Document it.
> >     * tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize pattern.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> >     * gcc.dg/vect/vect-div-bitmask-1.c: New test.
> >     * gcc.dg/vect/vect-div-bitmask-2.c: New test.
> >     * gcc.dg/vect/vect-div-bitmask-3.c: New test.
> >     * gcc.dg/vect/vect-div-bitmask.h: New file.
> > 
> > --- inline copy of patch -- 
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > index 
> > f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38
> >  100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -5588,6 +5588,18 @@ signed op0, op1;
> >  op0 = op1 / (1 << imm);
> >  @end smallexample
> >  
> > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> > +@item @samp{udiv_pow2_bitmask@var{m2}}
> > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> > +@itemx @samp{udiv_pow2_bitmask@var{m2}}
> > +Unsigned vector division by an immediate that is equivalent to
> > +@samp{2^(bitsize(m) / 2) - 1}.
> > +@smallexample
> > +unsigned short op0; op1;
> > +@dots{}
> > +op0 = op1 / 0xffU;
> > +@end smallexample
> > +
> >  @cindex @code{vec_shl_insert_@var{m}} instruction pattern
> >  @item @samp{vec_shl_insert_@var{m}}
> >  Shift the elements in vector input operand 1 left one element (i.e.@:
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index 
> > d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f
> >  100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | 
> > ECF_NOTHROW,
> >                    vec_shl_insert, binary)
> >  
> >  DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, 
> > binary)
> > +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW,
> > +                  udiv_pow2_bitmask, unary)
> >  
> >  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
> >  DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
> > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > index 
> > 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f
> >  100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
> >  OPTAB_D (umulhs_optab, "umulhs$a3")
> >  OPTAB_D (umulhrs_optab, "umulhrs$a3")
> >  OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
> > +OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
> >  OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
> >  OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
> >  OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c 
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > new file mode 100644
> > index 
> > 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0bead1f6a452de3f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint8_t 
> > +
> > +__attribute__((noipa, noinline, optimize("O1")))
> > +void fun1(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff;
> > +}
> > +
> > +__attribute__((noipa, noinline, optimize("O3")))
> > +void fun2(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff;
> > +}
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" 
> > "vect" } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c 
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > new file mode 100644
> > index 
> > 0000000000000000000000000000000000000000..009e16e1b36497e5724410d9843f1ce122b26dda
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint16_t 
> > +
> > +__attribute__((noipa, noinline, optimize("O1")))
> > +void fun1(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU;
> > +}
> > +
> > +__attribute__((noipa, noinline, optimize("O3")))
> > +void fun2(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU;
> > +}
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" 
> > "vect" } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c 
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > new file mode 100644
> > index 
> > 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d94220df849cc47930b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } 
> > } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint32_t 
> > +
> > +__attribute__((noipa, noinline, optimize("O1")))
> > +void fun1(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> > +}
> > +
> > +__attribute__((noipa, noinline, optimize("O3")))
> > +void fun2(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> > +}
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" 
> > "vect" } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h 
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > new file mode 100644
> > index 
> > 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > @@ -0,0 +1,43 @@
> > +#include <stdio.h>
> > +
> > +#ifndef N
> > +#define N 65
> > +#endif
> > +
> > +#ifndef TYPE
> > +#define TYPE uint32_t
> > +#endif
> > +
> > +#ifndef DEBUG
> > +#define DEBUG 0
> > +#endif
> > +
> > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > +
> > +int main ()
> > +{
> > +  TYPE a[N];
> > +  TYPE b[N];
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i] = BASE + i * 13;
> > +      b[i] = BASE + i * 13;
> > +      if (DEBUG)
> > +        printf ("%d: 0x%x\n", i, a[i]);
> > +    }
> > +
> > +  fun1 (a, N / 2, N);
> > +  fun2 (b, N / 2, N);
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      if (DEBUG)
> > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > +
> > +      if (a[i] != b[i])
> > +        __builtin_abort ();
> > +    }
> > +  return 0;
> > +}
> > +
> > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > index 
> > 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d716310ca8d82957b5
> >  100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> >  
> >        return pattern_stmt;
> >      }
> > +  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
> > +      && rhs_code != TRUNC_MOD_EXPR)
> > +    {
> > +      wide_int icst = wi::to_wide (oprnd1);
> > +      wide_int val = wi::add (icst, 1);
> > +      int pow = wi::exact_log2 (val);
> > +      if (pow == (prec / 2))
> > +   {
> > +     /* Pattern detected.  */
> > +     vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
> > +
> > +     *type_out = vectype;
> > +
> > +     /* Check if the target supports this internal function.  */
> > +     internal_fn ifn = IFN_DIV_POW2_BITMASK;
> > +     if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
> > +       {
> > +         tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> > +         gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
> > +         gimple_call_set_lhs (div_stmt, var_div);
> > +
> > +         gimple_set_location (div_stmt, gimple_location (last_stmt));
> > +
> > +         return div_stmt;
> > +       }
> > +   }
> > +    }
> >  
> >    if (prec > HOST_BITS_PER_WIDE_INT
> >        || integer_zerop (oprnd1))
> > 
> > 
> > 
> > 
> > 
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

Re: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask

Reply via email to