Hi All,

This optimizes the case where a mask Y which fulfills ~Y + 1 == pow2 is used to
clear some bits and the result is then compared against 0, by turning it into a
single compare, without the masking, against a different immediate.

We can do this for all unsigned compares and for signed we can do it for
comparisons of EQ and NE:

(x & (~255)) == 0 becomes x <= 255, which leaves it to the target to deal with
the comparison optimally.

This transformation has to be done in the mid-end because in RTL you don't have
the signedness of the comparison operands, and if the target needs an immediate
it should be floated outside of the loop.

The RTL loop invariant hoisting is done before split1.

i.e.

void fun1(int32_t *x, int n)
{
    for (int i = 0; i < (n & -16); i++)
      x[i] = (x[i]&(~255)) == 0;
}

now generates:

.L3:
        ldr     q0, [x0]
        cmhs    v0.4s, v2.4s, v0.4s
        and     v0.16b, v1.16b, v0.16b
        str     q0, [x0], 16
        cmp     x0, x1
        bne     .L3

and floats the immediate out of the loop.

instead of:

.L3:
        ldr     q0, [x0]
        bic     v0.4s, #255
        cmeq    v0.4s, v0.4s, #0
        and     v0.16b, v1.16b, v0.16b
        str     q0, [x0], 16
        cmp     x0, x1
        bne     .L3

Bootstrapped and regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * match.pd: New bitmask compare pattern.

gcc/testsuite/ChangeLog:

        * gcc.dg/bic-bitmask-10.c: New test.
        * gcc.dg/bic-bitmask-11.c: New test.
        * gcc.dg/bic-bitmask-12.c: New test.
        * gcc.dg/bic-bitmask-2.c: New test.
        * gcc.dg/bic-bitmask-3.c: New test.
        * gcc.dg/bic-bitmask-4.c: New test.
        * gcc.dg/bic-bitmask-5.c: New test.
        * gcc.dg/bic-bitmask-6.c: New test.
        * gcc.dg/bic-bitmask-7.c: New test.
        * gcc.dg/bic-bitmask-8.c: New test.
        * gcc.dg/bic-bitmask-9.c: New test.
        * gcc.dg/bic-bitmask.h: New test.
        * gcc.target/aarch64/bic-bitmask-1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/match.pd b/gcc/match.pd
index 
0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..df9212cb27d172856b9d43b0875262f96e8993c4
 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4288,6 +4288,56 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     (if (ic == ncmp)
      (ncmp @0 @1))))))
 
+/* Transform comparisons of the form (X & Y) CMP 0 to X CMP2 Z
+   where ~Y + 1 == pow2 and Z = ~Y.  */
+(for cmp (simple_comparison)
+ (simplify
+  (cmp (bit_and:c @0 VECTOR_CST@1) integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@1))
+       && uniform_vector_p (@1))
+    (with { tree elt = vector_cst_elt (@1, 0); }
+     (switch
+      (if (TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_uhwi_p (elt))
+       (with { unsigned HOST_WIDE_INT diff = tree_to_uhwi (elt);
+               tree tdiff = wide_int_to_tree (TREE_TYPE (elt), (~diff) + 1);
+               tree newval = wide_int_to_tree (TREE_TYPE (elt), ~diff);
+               tree newmask = build_uniform_cst (TREE_TYPE (@1), newval); }
+        (if (integer_pow2p (tdiff))
+         (switch
+          /* ((mask & x) < 0) -> 0.  */
+          (if (cmp == LT_EXPR)
+           { build_zero_cst (TREE_TYPE (@1)); })
+          /* ((mask & x) <= 0) -> x < mask.  */
+          (if (cmp == LE_EXPR)
+           (lt @0 { newmask; }))
+          /* ((mask & x) == 0) -> x < mask.  */
+          (if (cmp == EQ_EXPR)
+           (le @0 { newmask; }))
+          /* ((mask & x) != 0) -> x > mask.  */
+          (if (cmp == NE_EXPR)
+           (gt @0 { newmask; }))
+          /* ((mask & x) >= 0) -> x <= mask.  */
+          (if (cmp == GE_EXPR)
+           (le @0 { newmask; }))
+           /* ((mask & x) > 0) -> x < mask.  */
+          (if (cmp == GT_EXPR)
+           (lt @0 { newmask; }))))))
+      (if (!TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_shwi_p (elt))
+       (with { unsigned HOST_WIDE_INT diff = tree_to_shwi (elt);
+               tree ustype = unsigned_type_for (TREE_TYPE (elt));
+               tree uvtype = unsigned_type_for (TREE_TYPE (@1));
+               tree tdiff = wide_int_to_tree (ustype, (~diff) + 1);
+               tree udiff = wide_int_to_tree (ustype, ~diff);
+               tree cst = build_uniform_cst (uvtype, udiff); }
+        (if (integer_pow2p (tdiff))
+         (switch
+           /* ((mask & x) == 0) -> x < mask.  */
+           (if (cmp == EQ_EXPR)
+            (le (convert:uvtype @0) { cst; }))
+           /* ((mask & x) != 0) -> x > mask.  */
+           (if (cmp == NE_EXPR)
+            (gt (convert:uvtype @0) { cst; })))))))))))
+
 /* Transform comparisons of the form X - Y CMP 0 to X CMP Y.
    ??? The transformation is valid for the other operators if overflow
    is undefined for the type, but performing it here badly interacts
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-10.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-10.c
new file mode 100644
index 
0000000000000000000000000000000000000000..76a22a2313137a2a75dd711c2c15c2d3a34e15aa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-10.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#define TYPE int32_t
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-11.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-11.c
new file mode 100644
index 
0000000000000000000000000000000000000000..32553d7ba2f823f7a21237451990d0a216d2f912
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-11.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump {>\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-12.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-12.c
new file mode 100644
index 
0000000000000000000000000000000000000000..e10cbf7fabe2dbf7ce436cdf37b0f8b207c58408
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-12.c
@@ -0,0 +1,17 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+__attribute__((noinline, noipa))
+void fun(v4si *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-2.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..da30fad89f6c8239baa4395b3ffaec0be577e13f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-3.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-3.c
new file mode 100644
index 
0000000000000000000000000000000000000000..da30fad89f6c8239baa4395b3ffaec0be577e13f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-3.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-4.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-4.c
new file mode 100644
index 
0000000000000000000000000000000000000000..1bcf23ccf1447d6c8c999ed1eb25ba0a450028e1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {=\s*.+\{ 1,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-5.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-5.c
new file mode 100644
index 
0000000000000000000000000000000000000000..6e5a2fca9992efbc01f8dbbc6f95936e86643028
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-5.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {>\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&`s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-6.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-6.c
new file mode 100644
index 
0000000000000000000000000000000000000000..018e7a4348c9fc461106c3d9d01291325d3406c2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-6.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-7.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-7.c
new file mode 100644
index 
0000000000000000000000000000000000000000..798678fb7555052c93abc4ca34f617d640f73bb4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-7.c
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {__builtin_memset} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-8.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-8.c
new file mode 100644
index 
0000000000000000000000000000000000000000..1dabe834ed57dfa0be48c1dc3dbb226092c79a1a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-8.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {>\s*.+\{ 1,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967294,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-9.c 
b/gcc/testsuite/gcc.dg/bic-bitmask-9.c
new file mode 100644
index 
0000000000000000000000000000000000000000..9c1f8ee0adfc45d1b9fc212138ea26bb6b693e49
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-9.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not {<=\s*.+\{ 4294967289,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } 
} } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask.h 
b/gcc/testsuite/gcc.dg/bic-bitmask.h
new file mode 100644
index 
0000000000000000000000000000000000000000..2b94065c025e0cbf71a21ac9b9d6314e24b0c2d9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 50
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N);
+  fun2 (b, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bic-bitmask-1.c 
b/gcc/testsuite/gcc.target/aarch64/bic-bitmask-1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..568c1ffc8bc4148efaeeba7a45a75ecbd3a7a3dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bic-bitmask-1.c
@@ -0,0 +1,13 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -save-temps" } */
+
+#include <arm_neon.h>
+
+uint32x4_t foo (int32x4_t a)
+{
+  int32x4_t cst = vdupq_n_s32 (255);
+  int32x4_t zero = vdupq_n_s32 (0);
+  return vceqq_s32 (vbicq_s32 (a, cst), zero);
+}
+
+/* { dg-final { scan-assembler-not {\tbic\t} { xfail { aarch64*-*-* } } } } */


-- 
diff --git a/gcc/match.pd b/gcc/match.pd
index 0fcfd0ea62c043dc217d0d560ce5b7e569b70e7d..df9212cb27d172856b9d43b0875262f96e8993c4 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4288,6 +4288,56 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     (if (ic == ncmp)
      (ncmp @0 @1))))))
 
+/* Transform comparisons of the form (X & Y) CMP 0 to X CMP2 Z
+   where ~Y + 1 == pow2 and Z = ~Y.  */
+(for cmp (simple_comparison)
+ (simplify
+  (cmp (bit_and:c @0 VECTOR_CST@1) integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@1))
+	&& uniform_vector_p (@1))
+    (with { tree elt = vector_cst_elt (@1, 0); }
+     (switch
+      (if (TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_uhwi_p (elt))
+	(with { unsigned HOST_WIDE_INT diff = tree_to_uhwi (elt);
+	        tree tdiff = wide_int_to_tree (TREE_TYPE (elt), (~diff) + 1);
+		tree newval = wide_int_to_tree (TREE_TYPE (elt), ~diff);
+		tree newmask = build_uniform_cst (TREE_TYPE (@1), newval); }
+	 (if (integer_pow2p (tdiff))
+	  (switch
+	   /* ((mask & x) < 0) -> 0.  */
+	   (if (cmp == LT_EXPR)
+	    { build_zero_cst (TREE_TYPE (@1)); })
+	   /* ((mask & x) <= 0) -> x < mask.  */
+	   (if (cmp == LE_EXPR)
+	    (lt @0 { newmask; }))
+	   /* ((mask & x) == 0) -> x < mask.  */
+	   (if (cmp == EQ_EXPR)
+	    (le @0 { newmask; }))
+	   /* ((mask & x) != 0) -> x > mask.  */
+	   (if (cmp == NE_EXPR)
+	    (gt @0 { newmask; }))
+	   /* ((mask & x) >= 0) -> x <= mask.  */
+	   (if (cmp == GE_EXPR)
+	    (le @0 { newmask; }))
+	    /* ((mask & x) > 0) -> x < mask.  */
+	   (if (cmp == GT_EXPR)
+	    (lt @0 { newmask; }))))))
+      (if (!TYPE_UNSIGNED (TREE_TYPE (@1)) && tree_fits_shwi_p (elt))
+	(with { unsigned HOST_WIDE_INT diff = tree_to_shwi (elt);
+		tree ustype = unsigned_type_for (TREE_TYPE (elt));
+		tree uvtype = unsigned_type_for (TREE_TYPE (@1));
+	        tree tdiff = wide_int_to_tree (ustype, (~diff) + 1);
+	        tree udiff = wide_int_to_tree (ustype, ~diff);
+		tree cst = build_uniform_cst (uvtype, udiff); }
+	 (if (integer_pow2p (tdiff))
+	  (switch
+	    /* ((mask & x) == 0) -> x < mask.  */
+	    (if (cmp == EQ_EXPR)
+	     (le (convert:uvtype @0) { cst; }))
+	    /* ((mask & x) != 0) -> x > mask.  */
+	    (if (cmp == NE_EXPR)
+	     (gt (convert:uvtype @0) { cst; })))))))))))
+
 /* Transform comparisons of the form X - Y CMP 0 to X CMP Y.
    ??? The transformation is valid for the other operators if overflow
    is undefined for the type, but performing it here badly interacts
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-10.c b/gcc/testsuite/gcc.dg/bic-bitmask-10.c
new file mode 100644
index 0000000000000000000000000000000000000000..76a22a2313137a2a75dd711c2c15c2d3a34e15aa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-10.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(int32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#define TYPE int32_t
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-11.c b/gcc/testsuite/gcc.dg/bic-bitmask-11.c
new file mode 100644
index 0000000000000000000000000000000000000000..32553d7ba2f823f7a21237451990d0a216d2f912
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-11.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) != 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump {>\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-12.c b/gcc/testsuite/gcc.dg/bic-bitmask-12.c
new file mode 100644
index 0000000000000000000000000000000000000000..e10cbf7fabe2dbf7ce436cdf37b0f8b207c58408
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-12.c
@@ -0,0 +1,17 @@
+/* { dg-do assemble } */
+/* { dg-options "-O3 -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+__attribute__((noinline, noipa))
+void fun(v4si *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+/* { dg-final { scan-tree-dump {<=\s*.+\{ 255,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-2.c b/gcc/testsuite/gcc.dg/bic-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..da30fad89f6c8239baa4395b3ffaec0be577e13f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-3.c b/gcc/testsuite/gcc.dg/bic-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..da30fad89f6c8239baa4395b3ffaec0be577e13f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-3.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) == 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-4.c b/gcc/testsuite/gcc.dg/bic-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..1bcf23ccf1447d6c8c999ed1eb25ba0a450028e1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) >= 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {=\s*.+\{ 1,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-5.c b/gcc/testsuite/gcc.dg/bic-bitmask-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..6e5a2fca9992efbc01f8dbbc6f95936e86643028
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-5.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) > 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {>\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&`s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-6.c b/gcc/testsuite/gcc.dg/bic-bitmask-6.c
new file mode 100644
index 0000000000000000000000000000000000000000..018e7a4348c9fc461106c3d9d01291325d3406c2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-6.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~255)) <= 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {<=\s*.+\{ 255,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967040,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-7.c b/gcc/testsuite/gcc.dg/bic-bitmask-7.c
new file mode 100644
index 0000000000000000000000000000000000000000..798678fb7555052c93abc4ca34f617d640f73bb4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-7.c
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) < 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {__builtin_memset} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-8.c b/gcc/testsuite/gcc.dg/bic-bitmask-8.c
new file mode 100644
index 0000000000000000000000000000000000000000..1dabe834ed57dfa0be48c1dc3dbb226092c79a1a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-8.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~1)) != 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-times {>\s*.+\{ 1,.+\}} 1 dce7 } } */
+/* { dg-final { scan-tree-dump-not {&\s*.+\{ 4294967294,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask-9.c b/gcc/testsuite/gcc.dg/bic-bitmask-9.c
new file mode 100644
index 0000000000000000000000000000000000000000..9c1f8ee0adfc45d1b9fc212138ea26bb6b693e49
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask-9.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fdump-tree-dce" } */
+
+#include <stdint.h>
+
+__attribute__((noinline, noipa))
+void fun1(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+__attribute__((noinline, noipa, optimize("O1")))
+void fun2(uint32_t *x, int n)
+{
+    for (int i = 0; i < (n & -16); i++)
+      x[i] = (x[i]&(~5)) == 0;
+}
+
+#include "bic-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not {<=\s*.+\{ 4294967289,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump {&\s*.+\{ 4294967290,.+\}} dce7 } } */
+/* { dg-final { scan-tree-dump-not {\s+bic\s+} dce7 { target { aarch64*-*-* } } } } */
+
diff --git a/gcc/testsuite/gcc.dg/bic-bitmask.h b/gcc/testsuite/gcc.dg/bic-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b94065c025e0cbf71a21ac9b9d6314e24b0c2d9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bic-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 50
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N);
+  fun2 (b, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bic-bitmask-1.c b/gcc/testsuite/gcc.target/aarch64/bic-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..568c1ffc8bc4148efaeeba7a45a75ecbd3a7a3dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bic-bitmask-1.c
@@ -0,0 +1,13 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -save-temps" } */
+
+#include <arm_neon.h>
+
+uint32x4_t foo (int32x4_t a)
+{
+  int32x4_t cst = vdupq_n_s32 (255);
+  int32x4_t zero = vdupq_n_s32 (0);
+  return vceqq_s32 (vbicq_s32 (a, cst), zero);
+}
+
+/* { dg-final { scan-assembler-not {\tbic\t} { xfail { aarch64*-*-* } } } } */

Reply via email to