Hi!

On a testcase with 256 __builtin_shuffle V4DFmode calls I've counted
17 calls to expand_vselect{,_vconcat} on average for each __builtin_shuffle
call (some during testing, some during actual expansion, but that is
also often not successful).  This patch adjusts the code so that, when
testing, it doesn't create new insns again and again; only if recog shows
that such an insn is supported, and we are not merely testing, does it
call emit_insn (copy_rtx ()) to emit it.

Bootstrapped/regtested on x86_64-linux and i686-linux, additionally tested
with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc RUNTESTFLAGS='--target_board=unix\{-m32/-mavx,-m64/-mavx\} dg-torture.exp=vshuf*'
Ok for trunk?

2012-03-20  Jakub Jelinek  <ja...@redhat.com>

        * config/i386/i386.c (vselect_insn): New variable.
        (init_vselect_insn): New function.
        (expand_vselect, expand_vselect_vconcat): Add testing_p argument.
        Call init_vselect_insn if vselect_insn is NULL.  Adjust
        PATTERN (vselect_insn), instead of creating a new insn each time,
        only emit a copy of it if not testing and recog has been successful.
        (expand_vec_perm_pshufb, expand_vec_perm_1,
        expand_vec_perm_pshuflw_pshufhw, expand_vec_perm_broadcast_1): Adjust
        callers.

--- gcc/config/i386/i386.c.jj   2012-03-19 18:10:20.000000000 +0100
+++ gcc/config/i386/i386.c      2012-03-20 08:46:59.751806243 +0100
@@ -35517,43 +35517,88 @@ ix86_builtin_vectorization_cost (enum ve
     }
 }
 
+/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
+   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
+   insn every time.  */
+
+static GTY(()) rtx vselect_insn;
+
+/* Initialize vselect_insn.  */
+
+static void
+init_vselect_insn (void)
+{
+  unsigned i;
+  rtx x;
+
+  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
+  for (i = 0; i < MAX_VECT_LEN; ++i)
+    XVECEXP (x, 0, i) = const0_rtx;
+  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
+                                                       const0_rtx), x);
+  x = gen_rtx_SET (VOIDmode, const0_rtx, x);
+  start_sequence ();
+  vselect_insn = emit_insn (x);
+  end_sequence ();
+}
+
 /* Construct (set target (vec_select op0 (parallel perm))) and
    return true if that's a valid instruction in the active ISA.  */
 
 static bool
-expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
+expand_vselect (rtx target, rtx op0, const unsigned char *perm,
+               unsigned nelt, bool testing_p)
 {
-  rtx rperm[MAX_VECT_LEN], x;
-  unsigned i;
+  unsigned int i;
+  rtx x, save_vconcat;
+  int icode;
 
-  for (i = 0; i < nelt; ++i)
-    rperm[i] = GEN_INT (perm[i]);
+  if (vselect_insn == NULL_RTX)
+    init_vselect_insn ();
 
-  x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
-  x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
-  x = gen_rtx_SET (VOIDmode, target, x);
+  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
+  PUT_NUM_ELEM (XVEC (x, 0), nelt);
+  for (i = 0; i < nelt; ++i)
+    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
+  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
+  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
+  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
+  SET_DEST (PATTERN (vselect_insn)) = target;
+  icode = recog_memoized (vselect_insn);
+
+  if (icode >= 0 && !testing_p)
+    emit_insn (copy_rtx (PATTERN (vselect_insn)));
+
+  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
+  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
+  INSN_CODE (vselect_insn) = -1;
 
-  x = emit_insn (x);
-  if (recog_memoized (x) < 0)
-    {
-      remove_insn (x);
-      return false;
-    }
-  return true;
+  return icode >= 0;
 }
 
 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
 
 static bool
 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
-                       const unsigned char *perm, unsigned nelt)
+                       const unsigned char *perm, unsigned nelt,
+                       bool testing_p)
 {
   enum machine_mode v2mode;
   rtx x;
+  bool ok;
+
+  if (vselect_insn == NULL_RTX)
+    init_vselect_insn ();
 
   v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
-  x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
-  return expand_vselect (target, x, perm, nelt);
+  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
+  PUT_MODE (x, v2mode);
+  XEXP (x, 0) = op0;
+  XEXP (x, 1) = op1;
+  ok = expand_vselect (target, x, perm, nelt, testing_p);
+  XEXP (x, 0) = const0_rtx;
+  XEXP (x, 1) = const0_rtx;
+  return ok;
 }
 
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
@@ -35885,7 +35930,7 @@ expand_vec_perm_pshufb (struct expand_ve
                    return true;
                  return expand_vselect (gen_lowpart (V4DImode, d->target),
                                         gen_lowpart (V4DImode, d->op0),
-                                        perm, 4);
+                                        perm, 4, false);
                }
 
              /* Next see if vpermd can be used.  */
@@ -36033,7 +36078,7 @@ expand_vec_perm_1 (struct expand_vec_per
            }
        }
 
-      if (expand_vselect (d->target, d->op0, perm2, nelt))
+      if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
        return true;
 
       /* There are plenty of patterns in sse.md that are written for
@@ -36047,7 +36092,8 @@ expand_vec_perm_1 (struct expand_vec_per
          perm2[i] = d->perm[i] & mask;
          perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
        }
-      if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
+      if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
+                                 d->testing_p))
        return true;
 
       /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
@@ -36061,13 +36107,15 @@ expand_vec_perm_1 (struct expand_vec_per
              perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
            }
 
-         if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
+         if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
+                                     d->testing_p))
            return true;
        }
     }
 
   /* Finally, try the fully general two operand permute.  */
-  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
+  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
+                             d->testing_p))
     return true;
 
   /* Recognize interleave style patterns with reversed operands.  */
@@ -36083,7 +36131,8 @@ expand_vec_perm_1 (struct expand_vec_per
          perm2[i] = e;
        }
 
-      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
+      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
+                                 d->testing_p))
        return true;
     }
 
@@ -36131,14 +36180,14 @@ expand_vec_perm_pshuflw_pshufhw (struct
   memcpy (perm2, d->perm, 4);
   for (i = 4; i < 8; ++i)
     perm2[i] = i;
-  ok = expand_vselect (d->target, d->op0, perm2, 8);
+  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
   gcc_assert (ok);
 
   /* Emit the pshufhw.  */
   memcpy (perm2 + 4, d->perm + 4, 4);
   for (i = 0; i < 4; ++i)
     perm2[i] = i;
-  ok = expand_vselect (d->target, d->target, perm2, 8);
+  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
   gcc_assert (ok);
 
   return true;
@@ -37723,7 +37772,8 @@ expand_vec_perm_broadcast_1 (struct expa
       while (vmode != V4SImode);
 
       memset (perm2, elt, 4);
-      ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
+      ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
+                          d->testing_p);
       gcc_assert (ok);
       return true;
 

        Jakub

Reply via email to