Clamp the selector to the actual number of elements, 2w (by masking it
with 2w - 1), instead of using the fixed value 0x1f.  This lets us simply
compare the clamped selector against w to generate the mask for blending.

gcc/

        * config/loongarch/loongarch.cc (loongarch_expand_vec_perm_1):
        Clamp the selector using twice the actual number of elements.
        Compare the clamped selector with the element number to get the
        blending mask.
---
 gcc/config/loongarch/loongarch.cc | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 9e9d6cfc75f..4e32b23b6db 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -9095,10 +9095,13 @@ loongarch_expand_vec_perm_1 (rtx operands[])
   w = GET_MODE_NUNITS (mode);
 
   /* If we are using xvshuf.*, clamp the selector to avoid unpredictable
-     output.  */
-  if (maskmode != V8SImode && maskmode != V4DImode)
+     output; if we need to blend two shuf results for the final result,
+     also clamp it so we can use xvslei to generate the bitmask for
+     the blending.  */
+  if ((maskmode != V8SImode && maskmode != V4DImode)
+      || !one_operand_shuffle)
     {
-      rtx t = gen_const_vec_duplicate (maskmode, GEN_INT (0x1f));
+      rtx t = gen_const_vec_duplicate (maskmode, GEN_INT (2 * w - 1));
       mask = expand_binop (maskmode, and_optab, mask, t, NULL_RTX, false,
                           OPTAB_DIRECT);
     }
@@ -9211,18 +9214,13 @@ merge_two:
   /* Then merge them together.  The key is whether any given control
      element contained a bit set that indicates the second word.  */
   rtx xops[6];
-  mask = operands[3];
-  vt = GEN_INT (w);
-  vt = gen_const_vec_duplicate (maskmode, vt);
-  vt = force_reg (maskmode, vt);
-  mask = expand_simple_binop (maskmode, AND, mask, vt,
-                             NULL_RTX, 0, OPTAB_DIRECT);
+  vt = gen_const_vec_duplicate (maskmode, GEN_INT (w - 1));
   if (GET_MODE (target) != mode)
     target = gen_reg_rtx (mode);
   xops[0] = target;
-  xops[1] = gen_lowpart (mode, t2);
-  xops[2] = gen_lowpart (mode, t1);
-  xops[3] = gen_rtx_EQ (maskmode, mask, vt);
+  xops[1] = gen_lowpart (mode, t1);
+  xops[2] = gen_lowpart (mode, t2);
+  xops[3] = gen_rtx_LEU (maskmode, mask, vt);
   xops[4] = mask;
   xops[5] = vt;
 
-- 
2.51.2

Reply via email to