[PATCH] Optimize 128-bit vector permutation with pand, pandn and por.

Cui, Lili Wed, 20 Nov 2024 04:02:52 -0800

Hi, all

This patch aims to handle certain vector shuffle operations using pand, pandn 
and por more efficiently.


Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?

Regards,
Lili.


This patch adds a new subroutine, expand_vec_perm_pand_pandn_por, called
from ix86_expand_vec_perm_const_1.  For V8HImode and V16QImode constant
permutations that blend two operands lane by lane (each result element
comes from the same position in one of the two inputs), it expands the
shuffle with pand, pandn and por.  Only SSE2 is required, so this improves
assembly code generation for configurations that lack pshufb.

gcc/ChangeLog:

        PR target/116675
        * config/i386/i386-expand.cc (expand_vec_perm_pand_pandn_por):
        New subroutine.
        (ix86_expand_vec_perm_const_1): Call expand_vec_perm_pand_pandn_por.

gcc/testsuite/ChangeLog:

        PR target/116675
        * gcc.target/i386/pr116675.c: New test.
---
 gcc/config/i386/i386-expand.cc           | 50 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr116675.c | 75 ++++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116675.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a6e6e738a52..f9fa0281298 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23103,6 +23103,53 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement a
+   two-operand V16QImode/V8HImode permutation that is a blend, i.e. each
+   dest[i] is either op0[i] or op1[i], with and, andnot and or on SSE2.
+
+   Intended for cases such as:
+   __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
+   __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);
+   NOTE(review): the second example has perm[0] == 8, which the
+   d->perm[0] != 0 test below rejects as written -- confirm it is
+   only reached with its operands already swapped.  */
+
+static bool
+expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
+{
+  rtx rperm[16], vperm;
+  unsigned int i, nelt = d->nelt;
+
+  if (!TARGET_SSE2
+      || d->one_operand_p
+      || (d->vmode != V16QImode && d->vmode != V8HImode))
+    return false;
+
+  if (d->perm[0] != 0)
+    return false;
+
+  /* Every remaining dest[i] must be op0[i] or op1[i], never a moved lane.  */
+  for (i = 1; i < nelt; i++)
+    if ((d->perm[i] % nelt) != i)
+      return false;
+
+  if (d->testing_p)
+     return true;
+
+  /* Blend mask: all-ones where dest[i] is op0[i], zero where it is op1[i].  */
+  machine_mode inner_mode = GET_MODE_INNER (d->vmode);
+  for (i = 0; i < nelt; i++)
+    rperm[i] = (d->perm[i] <  nelt) ? CONSTM1_RTX (inner_mode)
+      : CONST0_RTX (inner_mode);
+
+  vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
+  vperm = force_reg (d->vmode, vperm);
+
+  ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);
+
+  return true;
+}
+
 /* Implement permutation with pslldq + psrldq + por when pshufb is not
    available.  */
 static bool
@@ -24162,6 +24209,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_psrlw_psllw_por (d))
     return true;
 
+  if (expand_vec_perm_pand_pandn_por (d))
+    return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr116675.c b/gcc/testsuite/gcc.target/i386/pr116675.c
new file mode 100644
index 00000000000..e463dd8415f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116675.c
@@ -0,0 +1,75 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2 -mno-ssse3" } */
+/* { dg-final { scan-assembler-times "pand" 4 } } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-times "por" 4 } } */
+
+#include <emmintrin.h>
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v8hi foo1 (__v8hi a, __v8hi b)
+{ /* Even lanes come from A, odd lanes from B; no lane changes position.  */
+  return __builtin_shufflevector (a, b, 0, 9, 2, 11, 4, 13, 6, 15);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v8hi foo2 (__v8hi a, __v8hi b)
+{ /* Lanes 0-1 and 5-7 come from B, lanes 2-4 from A; lane 0 is from B.  */
+  return __builtin_shufflevector (a, b, 8, 9, 2, 3, 4, 13, 14, 15);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v16qi foo3 (__v16qi a, __v16qi b)
+{ /* Byte variant: even lanes come from A, odd lanes from B.  */
+  return __builtin_shufflevector (a, b, 0, 17, 2, 19, 4, 21, 6, 23,
+                                 8, 25, 10, 27, 12, 29, 14, 31);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v16qi foo4 (__v16qi a, __v16qi b)
+{ /* Lanes 0-4 and all even lanes come from A; odd lanes 5-15 from B.  */
+  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 21, 6, 23,
+                                        8, 25, 10, 27,12,29,14,31);
+}
+
+__attribute__((noinline, noclone)) void
+compare_v8hi (__v8hi a,  __v8hi b)
+{ /* Abort unless all 8 lanes of A and B are equal.  */
+  for (int i = 0; i < 8; i++) 
+    if (a[i] != b[i]) 
+      __builtin_abort ();
+}
+
+__attribute__((noinline, noclone)) void
+compare_v16qi (__v16qi a,  __v16qi b)
+{ /* Abort unless all 16 lanes of A and B are equal.  */
+  for (int i = 0; i < 16; i++)
+    if (a[i] != b[i])
+      __builtin_abort ();
+}
+
+int main (void)
+{ /* Run each shuffle on known inputs and compare with the expected lanes.  */
+  __v8hi s1, s2, s3, s4, s5, s6;
+  __v16qi s7, s8, s9, s10, s11, s12;
+  s1 = (__v8hi) {0, 1, 2, 3, 4, 5, 6, 7};
+  s2 = (__v8hi) {8, 9, 10, 11, 12, 13, 14, 15};
+  s7 = (__v16qi) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  s8 = (__v16qi) {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+  s3  = foo1 (s1, s2);
+  s4  = foo2 (s1, s2);
+  s9  = foo3 (s7, s8);
+  s10 = foo4 (s7, s8);
+
+  s5 = (__v8hi) {0, 9, 2, 11, 4, 13, 6, 15};
+  s6 = (__v8hi) {8, 9, 2, 3, 4, 13, 14, 15};
+  s11 = (__v16qi) {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+  s12 = (__v16qi) {0, 1, 2, 3, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+
+  compare_v8hi (s3, s5);
+  compare_v8hi (s4, s6);
+  compare_v16qi (s9, s11);
+  compare_v16qi (s10, s12);
+  return 0;
+}
-- 
2.34.1

[PATCH] Optimize 128-bit vector permutation with pand, pandn and por.

Reply via email to