Assuming first part of the patch is committed. Is the following patch
ok? It passes bootstrap and make check.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 91f6f21..475448e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42808,6 +42808,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 }

 static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d);

 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
    in a single instruction.  */
@@ -42943,6 +42944,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_vpermil (d))
     return true;

+  /* Try palignr on one operand.  */
+  if (d->one_operand_p && expand_vec_perm_palignr (d))
+    return true;
+
   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
      vpshufb, vpermd, vpermps or vpermq variable permutation.  */
   if (expand_vec_perm_pshufb (d))
@@ -43040,22 +43045,36 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
   else
     return false;

-  min = 2 * nelt, max = 0;
-  for (i = 0; i < nelt; ++i)
+  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
+     PALIGNR is better than any other permutaion expand.
+     Check for a rotation in permutation.  */
+  if (d->one_operand_p)
     {
-      unsigned e = d->perm[i];
-      if (e < min)
-       min = e;
-      if (e > max)
-       max = e;
+      min = d->perm[0];
+      for (i = 1; i < nelt; ++i)
+       if (d->perm[i] != ((min + i) & (nelt - 1)))
+         return false;
     }
-  if (min == 0 || max - min >= nelt)
-    return false;
+  /* For a 2 operand permutaion we check if elements fit within one vector.  */
+  else
+    {
+      min = 2 * nelt, max = 0;
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned e = d->perm[i];
+         if (e < min)
+           min = e;
+         if (e > max)
+           max = e;
+       }
+      if (min == 0 || max - min >= nelt)
+       return false;

-  /* Given that we have SSSE3, we know we'll be able to implement the
-     single operand permutation after the palignr with pshufb.  */
-  if (d->testing_p)
-    return true;
+      /* Given that we have SSSE3, we know we'll be able to implement the
+        single operand permutation after the palignr with pshufb.  */
+      if (d->testing_p)
+       return true;
+    }

   dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
@@ -43089,6 +43108,14 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
     }


+  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
+     the rest single operand permutation is just move.  */
+  if (d->one_operand_p)
+    {
+      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+      return true;
+    }
+
   dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
   dcopy.one_operand_p = true;


On Wed, Apr 30, 2014 at 6:25 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote:
> On Tue, Apr 29, 2014 at 9:39 PM, Richard Henderson <r...@redhat.com> wrote:
>> On 04/29/2014 10:13 AM, Evgeny Stupachenko wrote:
>>> +  /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
>>> +     PALIGNR is better than PSHUFB.  Check for a rotation in permutation.  
>>> */
>>> +  for (i = 0; i < nelt; ++i)
>>> +    if ((((d->perm[(i + 1) & (nelt - 1)] - d->perm[i])) & (nelt - 1)) != 1)
>>> +      return false;
>>> +
>>> +  min = d->perm[0];
>>
>> Why are you running this loop NELT times instead of NELT-1 like I suggested?
>> Why is that test expression so complicated?
>>
>> Obviously d->perm[0] is the anchor and everything else can be computed 
>> relative
>> to that.  I'd expect no more than
>>
>>   min = d->perm[0];
>>   for (i = 1; i < nelt; ++i)
>>     if (d->perm[i] != ((min + i) & (nelt - 1)))
>>       return false;
>
> Agree on this. The loop is less complicated.
>
>>
>> Now that I think of it,
>>
>>> +  /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>>> +     Requires SSSE3.  */
>>> +  if (GET_MODE_SIZE (d->vmode) == 16)
>>> +    {
>>> +      if (!TARGET_SSSE3)
>>> +       return false;
>>> +    }
>>> +  /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>>> +     PERM and PALIGNR.  It is more profitable than 2 PSHUFB and PERM.  */
>>> +  else if (GET_MODE_SIZE (d->vmode) == 32)
>>> +    {
>>> +      if (!TARGET_AVX2)
>>> +       return false;
>>> +    }
>>> +  else
>>> +    return false;
>>> +
>>> +  if (!d->one_operand_p)
>>> +    return false;
>>
>> The comments are wrong.  Move the operand_p test to the top,
>> where it should be, and they're more obviously wrong:
>>
>>   "must have one operand"
>>   "palignr of two operands..."
>>
>> I think your comment
>>
>>   /* For a rotaion permutation with one operand like: {5 6 7 0 1 2 3 4}
>>      we want to use PALIGNR.  */
>>
>> belongs up there instead of those two incorrect comments.
>>
>
> What I mean in the comments
>   16 bytes case:
>     For 1 operand permutation
>       Rotation will cost "palignr" which is better than "pshufb" as
> has smaller opcode (6 vs 9) and always costs 1 tick (pshufb takes 3-5
> ticks on some x86 archs).
>     For 2 operands permutation
>       If "palignr" is applicable it reduces instructions from: "2
> pshufb and or" to "palignr and pshufb". Profitable for the same
> reasons as above.
>   32 bytes case:
>     For 1 operand permutation
>       Rotation will cost only 2 instruction "palignr and perm" which
> is better than "2 pshufb and perm".
>     For 2 operands permutation
>       If palignr is applicable it reduces instructions from: "4 pshufb
> 2 perm and or" to "palignr, 2 pshufb, perm and or" and profitable for
> the same reasons as above.
>
> So the final reason is the same for 1 and 2 operands case. However I
> agree to extend the comments as they are not clear.
> Maybe we should unite one and two operand case into 1 function? I can
> submit such patch when patch 1/2 is committed.
>
> Thanks,
> Evgeny
>
>>
>>
>> r~

Reply via email to