Certain permutes that blend a vector with zero can be interpreted as an AND with a
mask. This idea was suggested by Richard Sandiford when he was reviewing my
patch which tries to optimize certain vector permutes with the FMOV instruction
for the aarch64 target. Canonicalizing this class of vector permutes as AND can
be more general and potentially benefit more targets.

For example, for the aarch64 target, at present:

v4hi
f_v4hi (v4hi x)
{
  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
}

generates:

f_v4hi:
        uzp1    v0.2d, v0.2d, v0.2d
        adrp    x0, .LC0
        ldr     d31, [x0, #:lo12:.LC0]
        tbl     v0.8b, {v0.16b}, v31.8b
        ret
.LC0:
        .byte   -1
        .byte   -1
        .byte   2
        .byte   3
        .byte   -1
        .byte   -1
        .byte   6
        .byte   7

With this patch, it generates:

f_v4hi:
        mvni    v31.2s, 0xff, msl 8
        and     v0.8b, v0.8b, v31.8b
        ret

However, we do have to xfail a few i386 tests due to the new canonicalization
this patch introduces and PR119922 has been filed to track these regressions.

        PR target/100165

gcc/ChangeLog:

        * optabs.cc (vec_perm_and_mask): New function.
        (expand_vec_perm_const): Add new AND canonicalization.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/avx-pr94680.c: XFAIL.
        * gcc.target/i386/avx10_2-vmovd-1.c: Likewise.
        * gcc.target/i386/avx10_2-vmovw-1.c: Likewise.
        * gcc.target/i386/avx512f-pr94680.c: Likewise.
        * gcc.target/i386/avx512fp16-pr94680.c: Likewise.
        * gcc.target/i386/sse2-pr94680.c: Likewise.
        * gcc.target/aarch64/and-be.c: New test.
        * gcc.target/aarch64/and.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
---
 gcc/optabs.cc                                 |  69 +++++++++-
 gcc/testsuite/gcc.target/aarch64/and-be.c     | 125 ++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/and.c        | 125 ++++++++++++++++++
 gcc/testsuite/gcc.target/i386/avx-pr94680.c   |   3 +-
 .../gcc.target/i386/avx10_2-vmovd-1.c         |   3 +-
 .../gcc.target/i386/avx10_2-vmovw-1.c         |   3 +-
 .../gcc.target/i386/avx512f-pr94680.c         |   3 +-
 .../gcc.target/i386/avx512fp16-pr94680.c      |   3 +-
 gcc/testsuite/gcc.target/i386/sse2-pr94680.c  |   3 +-
 9 files changed, 330 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/and-be.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/and.c

diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 0a14b1eef8a..dca9df42673 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -6384,6 +6384,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target,
   return NULL_RTX;
 }
 
+/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
+   the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
+   assuming the other vec_perm operand is a constant vector of zeros.  Return
+   the mask for the equivalent and operation, or NULL_RTX if the vec_perm can
+   not be modeled as an and.  MODE is the mode of the value being anded.
+   ZERO_OP0_P is true if the first operand of the vec_perm is a constant vector
+   of zeros or false if the second operand of the vec_perm is a constant vector
+   of zeros.  */
+static rtx
+vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+                  bool zero_op0_p)
+{
+  unsigned int nelt;
+  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
+    return NULL_RTX;
+
+  rtx_vector_builder builder (mode, nelt, 1);
+  machine_mode emode = GET_MODE_INNER (mode);
+
+  for (unsigned int i = 0; i < nelt; i++)
+    {
+      if (!zero_op0_p)
+       {
+         if (known_eq (sel[i], i))
+           builder.quick_push (CONSTM1_RTX (emode));
+         else if (known_ge (sel[i], nelt))
+           builder.quick_push (CONST0_RTX (emode));
+         else
+           return NULL_RTX;
+       }
+      else
+       {
+         if (known_eq (sel[i], nelt + i))
+           builder.quick_push (CONSTM1_RTX (emode));
+         else if (known_lt (sel[i], nelt))
+           builder.quick_push (CONST0_RTX (emode));
+         else
+           return NULL_RTX;
+       }
+    }
+
+  return builder.build ();
+}
+
 /* Implement a permutation of vectors v0 and v1 using the permutation
    vector in SEL and return the result.  Use TARGET to hold the result
    if nonnull and convenient.
@@ -6422,12 +6466,18 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
   insn_code shift_code_qi = CODE_FOR_nothing;
   optab shift_optab = unknown_optab;
   rtx v2 = v0;
+  bool zero_op0_p = false;
+  bool zero_op1_p = false;
   if (v1 == CONST0_RTX (GET_MODE (v1)))
-    shift_optab = vec_shr_optab;
+    {
+      shift_optab = vec_shr_optab;
+      zero_op1_p = true;
+    }
   else if (v0 == CONST0_RTX (GET_MODE (v0)))
     {
       shift_optab = vec_shl_optab;
       v2 = v1;
+      zero_op0_p = true;
     }
   if (shift_optab != unknown_optab)
     {
@@ -6463,6 +6513,23 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
            }
        }
     }
+  /* See if the vec_perm can be interpreted as an and operation.  We only do
+     this if one of the operands is all zeros.  */
+  if (sel_mode != BLKmode && (zero_op0_p || zero_op1_p))
+    {
+      insn_code and_code = optab_handler (and_optab, sel_mode);
+      rtx and_mask = vec_perm_and_mask (sel_mode, indices, zero_op0_p);
+      if (and_code != CODE_FOR_nothing && and_mask)
+       {
+         class expand_operand ops[3];
+         rtx tmp = gen_reg_rtx (sel_mode);
+         create_output_operand (&ops[0], tmp, sel_mode);
+         create_input_operand (&ops[1], gen_lowpart (sel_mode, v2), sel_mode);
+         create_input_operand (&ops[2], and_mask, sel_mode);
+         if (maybe_expand_insn (and_code, 3, ops))
+           return gen_lowpart (mode, ops[0].value);
+       }
+    }
 
   if (targetm.vectorize.vec_perm_const != NULL)
     {
diff --git a/gcc/testsuite/gcc.target/aarch64/and-be.c b/gcc/testsuite/gcc.target/aarch64/and-be.c
new file mode 100644
index 00000000000..8ed87949f0b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/and-be.c
@@ -0,0 +1,125 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+**     movi    v([0-9]+).2s, 0xff, msl 8
+**     and     v0.8b, v0.8b, v\1.8b
+**     ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+**     mvni    v([0-9]+).2s, 0xff, msl 8
+**     and     v0.8b, v0.8b, v\1.8b
+**     ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+**     adrp    x([0-9]+), .LC([0-9]+)
+**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
+**     and     v0.16b, v0.16b, v\3.16b
+**     ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+                           (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+**     movi    v([0-9]+).2d, 0xffffffff00000000
+**     and     v0.16b, v0.16b, v\1.16b
+**     ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+**     movi    v([0-9]+).2d, 0xffffffff
+**     and     v0.16b, v0.16b, v\1.16b
+**     ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+**     movi    v([0-9]+).2d, 0xffffffff
+**     and     v0.16b, v0.16b, v\1.16b
+**     ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+**     movi    v([0-9]+).2d, 0xffffffff00000000
+**     and     v0.16b, v\1.16b, v0.16b
+**     ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+**     movi    d([0-9]+), 0xff00ff00ff000000
+**     and     v0.8b, v0.8b, v\1.8b
+**     ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+                           (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+**     adrp    x([0-9]+), .LC([0-9]+)
+**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
+**     and     v0.16b, v0.16b, v\3.16b
+**     ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return __builtin_shuffle (
+      x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/and.c b/gcc/testsuite/gcc.target/aarch64/and.c
new file mode 100644
index 00000000000..56586612b6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/and.c
@@ -0,0 +1,125 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+**     mvni    v([0-9]+).2s, 0xff, msl 8
+**     and     v0.8b, v0.8b, v\1.8b
+**     ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+**     movi    v([0-9]+).2s, 0xff, msl 8
+**     and     v0.8b, v0.8b, v\1.8b
+**     ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+**     adrp    x([0-9]+), .LC([0-9]+)
+**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
+**     and     v0.16b, v0.16b, v\3.16b
+**     ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+                           (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+**     movi    v([0-9]+).2d, 0xffffffff
+**     and     v0.16b, v0.16b, v\1.16b
+**     ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+**     movi    v([0-9]+).2d, 0xffffffff00000000
+**     and     v0.16b, v0.16b, v\1.16b
+**     ret
+*/
+v4si
+g_v4si (v4si x)
+{
+  return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+**     movi    v([0-9]+).2d, 0xffffffff00000000
+**     and     v0.16b, v0.16b, v\1.16b
+**     ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+**     movi    v([0-9]+).2d, 0xffffffff
+**     and     v0.16b, v\1.16b, v0.16b
+**     ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+  return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+**     movi    d([0-9]+), 0xff00ff00ff
+**     and     v0.8b, v0.8b, v\1.8b
+**     ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+                           (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+**     adrp    x([0-9]+), .LC([0-9]+)
+**     ldr     q([0-9]+), \[x\1, #:lo12:.LC\2\]
+**     and     v0.16b, v0.16b, v\3.16b
+**     ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return __builtin_shuffle (
+      x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+      (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx-pr94680.c b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
index cb5041b6af3..4dc5315265a 100644
--- a/gcc/testsuite/gcc.target/i386/avx-pr94680.c
+++ b/gcc/testsuite/gcc.target/i386/avx-pr94680.c
@@ -1,6 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx -mno-avx512f -O2" } */
-/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 } } */
+/* xfailed due to PR target/119922 */
+/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%xmm[0-9]} 12 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-not "pxor" } } */
 
 typedef float v8sf __attribute__((vector_size(32)));
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
index 21bd1a1ef0a..593906bf36e 100644
--- a/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmovd-1.c
@@ -4,7 +4,8 @@
 /* { dg-final { scan-assembler-times "vmovss\t\[0-9\]+\\(%e\[bs\]p\\), %xmm0" 1 { target ia32 } } } */
 /* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 3 { target ia32 } } } */
 /* { dg-final { scan-assembler-times "vmovd\t%edi, %xmm0" 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 4 { target { ! ia32 } } } } */
+/* xfailed due to PR target/119922 */
+/* { dg-final { scan-assembler-times "vmovd\t%xmm0, %xmm0" 4 { target { ! ia32 } xfail *-*-* } } } */
 
 
 #include<immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
index 49fa51dc2ec..cb30a682260 100644
--- a/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-vmovw-1.c
@@ -3,7 +3,8 @@
 /* { dg-final { scan-assembler-times "vmovw\t\[0-9\]+\\(%e\[bs\]p\\), %xmm0" 4 { target ia32 } } } */
 /* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 4 { target ia32 } } } */
 /* { dg-final { scan-assembler-times "vmovw\t%edi, %xmm0" 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 7 { target { ! ia32 } } } } */
+/* xfailed due to PR target/119922 */
+/* { dg-final { scan-assembler-times "vmovw\t%xmm0, %xmm0" 7 { target { ! ia32 } xfail *-*-* } } } */
 
 #include<immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
index c27431aae72..af41b14ed7c 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr94680.c
@@ -1,6 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -mavx512vbmi -O2" } */
-/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12} } */
+/* xfailed due to PR target/119922 */
+/* { dg-final { scan-assembler-times {(?n)vmov[a-z0-9]*[ \t]*%ymm[0-9]} 12 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-not "pxor" } } */
 
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
index bfe11236eef..631f26be9b5 100644
--- a/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-pr94680.c
@@ -1,7 +1,8 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512fp16 -mavx512vl -O2" } */
 /* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
-/* { dg-final { scan-assembler-times "vmovq" 2 } } */
+/* xfailed due to PR target/119922 */
+/* { dg-final { scan-assembler-times "vmovq" 2 { xfail *-*-* } } } */
 
 typedef _Float16 v32hf __attribute__((vector_size (64)));
 typedef _Float16 v16hf __attribute__((vector_size (32)));
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
index 7e0ff9f6bc7..84692410534 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr94680.c
@@ -1,6 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-msse2 -mno-sse4.1 -O2" } */
-/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 } } */
+/* xfailed due to PR target/119922 */
+/* { dg-final { scan-assembler-times {(?n)(?:mov|psrldq).*%xmm[0-9]} 12 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-not "pxor" } } */
 
 typedef float v4sf __attribute__((vector_size(16)));
-- 
2.17.1

Reply via email to