Hi All,

In plenty of image and video processing code it's common to modify pixel values
by a widening operation and then scale them back into range by dividing by 255.

This patch adds an optab to allow us to emit an optimized sequence when doing
an unsigned division that is equivalent to:

   x = y / (2 ^ (bitsize (y) / 2) - 1)

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * internal-fn.def (DIV_POW2_BITMASK): New.
        * optabs.def (udiv_pow2_bitmask_optab): New.
        * doc/md.texi: Document it.
        * tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize pattern.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/vect-div-bitmask-1.c: New test.
        * gcc.dg/vect/vect-div-bitmask-2.c: New test.
        * gcc.dg/vect/vect-div-bitmask-3.c: New test.
        * gcc.dg/vect/vect-div-bitmask.h: New file.

--- inline copy of patch -- 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5588,6 +5588,18 @@ signed op0, op1;
 op0 = op1 / (1 << imm);
 @end smallexample
 
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@item @samp{udiv_pow2_bitmask@var{m2}}
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@itemx @samp{udiv_pow2_bitmask@var{m2}}
+Unsigned vector division by an immediate that is equivalent to
+@samp{2^(bitsize(m) / 2) - 1}.
+@smallexample
+unsigned short op0, op1;
+@dots{}
+op0 = op1 / 0xffU;
+@end smallexample
+
 @cindex @code{vec_shl_insert_@var{m}} instruction pattern
 @item @samp{vec_shl_insert_@var{m}}
 Shift the elements in vector input operand 1 left one element (i.e.@:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 
d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f
 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | 
ECF_NOTHROW,
                       vec_shl_insert, binary)
 
 DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary)
+DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW,
+                      udiv_pow2_bitmask, unary)
 
 DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
 DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 
801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f
 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
 OPTAB_D (umulhs_optab, "umulhs$a3")
 OPTAB_D (umulhrs_optab, "umulhrs$a3")
 OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
+OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
 OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
 OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
 OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0bead1f6a452de3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..009e16e1b36497e5724410d9843f1ce122b26dda
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 
0000000000000000000000000000000000000000..bf35a0bda8333c418e692d94220df849cc47930b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 
0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 
217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d716310ca8d82957b5
 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
+          && rhs_code != TRUNC_MOD_EXPR)
+    {
+      wide_int icst = wi::to_wide (oprnd1);
+      wide_int val = wi::add (icst, 1);
+      int pow = wi::exact_log2 (val);
+      if (pow == (prec / 2))
+       {
+         /* Pattern detected.  */
+         vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
+
+         *type_out = vectype;
+
+         /* Check if the target supports this internal function.  */
+         internal_fn ifn = IFN_DIV_POW2_BITMASK;
+         if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
+           {
+             tree var_div = vect_recog_temp_ssa_var (itype, NULL);
+             gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
+             gimple_call_set_lhs (div_stmt, var_div);
+
+             gimple_set_location (div_stmt, gimple_location (last_stmt));
+
+             return div_stmt;
+           }
+       }
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))




-- 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5588,6 +5588,18 @@ signed op0, op1;
 op0 = op1 / (1 << imm);
 @end smallexample
 
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@item @samp{udiv_pow2_bitmask@var{m2}}
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@itemx @samp{udiv_pow2_bitmask@var{m2}}
+Unsigned vector division by an immediate that is equivalent to
+@samp{2^(bitsize(m) / 2) - 1}.
+@smallexample
+unsigned short op0, op1;
+@dots{}
+op0 = op1 / 0xffU;
+@end smallexample
+
 @cindex @code{vec_shl_insert_@var{m}} instruction pattern
 @item @samp{vec_shl_insert_@var{m}}
 Shift the elements in vector input operand 1 left one element (i.e.@:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 
d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f
 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | 
ECF_NOTHROW,
                       vec_shl_insert, binary)
 
 DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary)
+DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW,
+                      udiv_pow2_bitmask, unary)
 
 DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
 DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 
801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f
 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
 OPTAB_D (umulhs_optab, "umulhs$a3")
 OPTAB_D (umulhrs_optab, "umulhrs$a3")
 OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
+OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
 OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
 OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
 OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0bead1f6a452de3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..009e16e1b36497e5724410d9843f1ce122b26dda
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 
0000000000000000000000000000000000000000..bf35a0bda8333c418e692d94220df849cc47930b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h 
b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 
0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 
217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d716310ca8d82957b5
 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
+          && rhs_code != TRUNC_MOD_EXPR)
+    {
+      wide_int icst = wi::to_wide (oprnd1);
+      wide_int val = wi::add (icst, 1);
+      int pow = wi::exact_log2 (val);
+      if (pow == (prec / 2))
+       {
+         /* Pattern detected.  */
+         vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
+
+         *type_out = vectype;
+
+         /* Check if the target supports this internal function.  */
+         internal_fn ifn = IFN_DIV_POW2_BITMASK;
+         if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
+           {
+             tree var_div = vect_recog_temp_ssa_var (itype, NULL);
+             gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
+             gimple_call_set_lhs (div_stmt, var_div);
+
+             gimple_set_location (div_stmt, gimple_location (last_stmt));
+
+             return div_stmt;
+           }
+       }
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))



Reply via email to