On LoongArch, when the permutation indices select elements from both input
vectors and no index is repeated, a V8SI/V8SF shuffle can be implemented
with two xvperm.w instructions plus one xvbitsel.v instruction, and a
V4DI/V4DF shuffle with two xvpermi.d instructions plus one xvbitsel.v.

gcc/ChangeLog:

        * config/loongarch/loongarch.cc
        (loongarch_expand_vec_perm_generic_bitsel): Add new vector shuffle
        optimization function.
        (loongarch_expand_vec_perm_const): Adjust.

gcc/testsuite/ChangeLog:

        * gcc.target/loongarch/vec_perm-xvbitsel-2.c: New test.
        * gcc.target/loongarch/vec_perm-xvbitsel-3.c: New test.
---
 gcc/config/loongarch/loongarch.cc             | 136 ++++++++++++++++++
 .../loongarch/vec_perm-xvbitsel-2.c           |  18 +++
 .../loongarch/vec_perm-xvbitsel-3.c           |  22 +++
 3 files changed, 176 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index a4a72923b7f..16fe755742b 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -9229,6 +9229,139 @@ loongarch_expand_vec_perm_bitsel (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Shuffle 256-bit V8SI/V8SF/V4DI/V4DF vectors whose permutation indices are
+   distinct modulo nelt: permute both inputs, then merge them by bit-select.  */
+static bool
+loongarch_expand_vec_perm_generic_bitsel (struct expand_vec_perm_d *d)
+{
+  if (!ISA_HAS_LASX)
+    return false;
+
+  auto_bitmap used;
+  machine_mode mode = d->vmode;
+  int nelt = d->nelt, val, i;
+
+  /* Due to instruction set restrictions, only V8SI, V8SF, V4DI and V4DF
+     support this optimization method; reject every other mode.  */
+  if (mode != E_V8SImode && mode != E_V8SFmode
+      && mode != E_V4DImode && mode != E_V4DFmode)
+    return false;
+
+  /* Reject the permutation if any d->perm[i] % nelt value repeats.  */
+  for (i = 0; i < nelt; i += 1)
+    {
+      if (bitmap_bit_p (used, d->perm[i] % nelt))
+       return false;
+      else
+       bitmap_set_bit (used, d->perm[i] % nelt);
+    }
+
+  if (d->testing_p)
+    return true;
+
+  rtx reg_bitsel, tmp_bitsel, sel_bitsel, op0, op1;
+  rtx rmap_bitsel[MAX_VECT_LEN];
+  op0 = gen_reg_rtx (mode);
+  op1 = gen_reg_rtx (mode);
+  reg_bitsel = gen_reg_rtx (mode);
+
+  if (mode == E_V8SImode || mode == E_V8SFmode)
+    {
+      rtx rmap_xvperm[MAX_VECT_LEN];
+      rtx sel_xvperm, reg_xvperm;
+
+      for (i = 0; i < nelt; i += 1)
+       {
+         /* For the xvperm insn we just copy the original permutation index.  */
+         rmap_xvperm[i] = GEN_INT (d->perm[i]);
+         val = d->perm[i] >= nelt ? -1 : 0;
+         /* For the xvbitsel insn, convert the index to a mask element: -1
+            means the destination element comes from operand1, and 0 means
+            it comes from operand0.  */
+         rmap_bitsel[i] = GEN_INT (val);
+       }
+
+      reg_xvperm = gen_reg_rtx (E_V8SImode);
+
+      /* Load the constant index vector for xvperm into a register.  */
+      sel_xvperm = gen_rtx_CONST_VECTOR (E_V8SImode,
+                                        gen_rtvec_v (nelt, rmap_xvperm));
+      emit_move_insn (reg_xvperm, sel_xvperm);
+
+      /* Load the constant selection mask for xvbitsel into a register.  */
+      sel_bitsel = gen_rtx_CONST_VECTOR (E_V8SImode,
+                                        gen_rtvec_v (nelt, rmap_bitsel));
+      if (mode == E_V8SFmode)
+       {
+         tmp_bitsel = simplify_gen_subreg (E_V8SImode, reg_bitsel, mode, 0);
+         emit_move_insn (tmp_bitsel, sel_bitsel);
+       }
+      else
+       emit_move_insn (reg_bitsel, sel_bitsel);
+
+      switch (mode)
+       {
+       case E_V8SFmode:
+         emit_insn (gen_lasx_xvperm_w_f (op0, d->op0, reg_xvperm));
+         emit_insn (gen_lasx_xvperm_w_f (op1, d->op1, reg_xvperm));
+         break;
+       case E_V8SImode:
+         emit_insn (gen_lasx_xvperm_w (op0, d->op0, reg_xvperm));
+         emit_insn (gen_lasx_xvperm_w (op1, d->op1, reg_xvperm));
+         break;
+       default:
+         gcc_unreachable ();
+         break;
+       }
+
+      emit_insn (gen_simd_vbitsel (mode, d->target, op0, op1, reg_bitsel));
+    }
+  else
+    {
+      unsigned int imm = 0;
+      unsigned int val2;
+
+      for (i = nelt - 1; i >= 0; i -= 1)  /* perm[0] ends up in bits 1:0.  */
+       {
+         val = d->perm[i] >= nelt ? -1 : 0;
+         rmap_bitsel[i] = GEN_INT (val);
+         val2 = d->perm[i] % nelt;
+         imm |= val2;
+         imm = (i != 0) ? imm << 2 : imm;  /* 2 bits per element.  */
+       }
+
+      /* Load the constant selection mask for xvbitsel into a register.  */
+      sel_bitsel = gen_rtx_CONST_VECTOR (E_V4DImode,
+                                        gen_rtvec_v (nelt, rmap_bitsel));
+      if (mode == E_V4DFmode)
+       {
+         tmp_bitsel = simplify_gen_subreg (E_V4DImode, reg_bitsel, mode, 0);
+         emit_move_insn (tmp_bitsel, sel_bitsel);
+       }
+      else
+       emit_move_insn (reg_bitsel, sel_bitsel);
+
+      switch (mode)
+       {
+       case E_V4DFmode:
+         emit_insn (gen_lasx_xvpermi_d_v4df (op0, d->op0, GEN_INT (imm)));
+         emit_insn (gen_lasx_xvpermi_d_v4df (op1, d->op1, GEN_INT (imm)));
+         break;
+       case E_V4DImode:
+         emit_insn (gen_lasx_xvpermi_d_v4di (op0, d->op0, GEN_INT (imm)));
+         emit_insn (gen_lasx_xvpermi_d_v4di (op1, d->op1, GEN_INT (imm)));
+         break;
+       default:
+         gcc_unreachable ();
+         break;
+       }
+
+      emit_insn (gen_simd_vbitsel (mode, d->target, op0, op1, reg_bitsel));
+    }
+
+  return true;
+}
+
 /* Following are the assist function for const vector permutation support.  */
 static bool
 loongarch_is_quad_duplicate (struct expand_vec_perm_d *d)
@@ -9749,6 +9882,9 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d)
          goto expand_perm_const_end;
        }
 
+      if (loongarch_expand_vec_perm_generic_bitsel (d))
+       return true;
+
 expand_perm_const_end:
       if (flag)
        {
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c
new file mode 100644
index 00000000000..3c38199126a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler "xvpermi.d" } } */
+/* { dg-final { scan-assembler-not "xvrepli.w" } } */
+/* { dg-final { scan-assembler-not "xvand.v" } } */
+/* { dg-final { scan-assembler-not "xvseq.w" } } */
+
+void
+foo (double a[], double b[], double c[])
+{
+  for (int i = 0; i < 800; i += 4)  /* Lanes 0, 2, 3, 1; ops + - - +.  */
+    {
+      c[i + 0] = a[i + 0] + b[i + 0];
+      c[i + 1] = a[i + 2] - b[i + 2];
+      c[i + 2] = a[i + 3] - b[i + 3];
+      c[i + 3] = a[i + 1] + b[i + 1];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c
new file mode 100644
index 00000000000..065c816a15d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vec_perm-xvbitsel-3.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mlasx" } */
+/* { dg-final { scan-assembler "xvperm.w" } } */
+/* { dg-final { scan-assembler-not "xvrepli.w" } } */
+/* { dg-final { scan-assembler-not "xvand.v" } } */
+/* { dg-final { scan-assembler-not "xvseq.w" } } */
+
+void
+foo (float a[], float b[], float c[])
+{
+  for (int i = 0; i < 800; i += 8)  /* Lanes 0 1 4 5 2 3 6 7; ops ++----++.  */
+    {
+      c[i + 0] = a[i + 0] + b[i + 0];
+      c[i + 1] = a[i + 1] + b[i + 1];
+      c[i + 2] = a[i + 4] - b[i + 4];
+      c[i + 3] = a[i + 5] - b[i + 5];
+      c[i + 4] = a[i + 2] - b[i + 2];
+      c[i + 5] = a[i + 3] - b[i + 3];
+      c[i + 6] = a[i + 6] + b[i + 6];
+      c[i + 7] = a[i + 7] + b[i + 7];
+    }
+}
-- 
2.38.1

Reply via email to