Hi!

On
#define N 4096
unsigned int b[N], d[N];

void
foo (void)
{
  int i;
  for (i = 0; i < N; i++)
    d[i] = b[i] / 3;
}
testcase I was looking at earlier today because of the XOP issues,
I've noticed we generate unnecessary code:
        vmovdqa .LC0(%rip), %ymm2
...
        vpsrlq  $32, %ymm2, %ymm3
before the loop and in the loop:
        vmovdqa b(%rax), %ymm0
        vpmuludq        b(%rax), %ymm2, %ymm1
...
        vpsrlq  $32, %ymm0, %ymm0
        vpmuludq        %ymm3, %ymm0, %ymm0
...
.LC0:
        .long   -1431655765
        .long   -1431655765
        .long   -1431655765
        .long   -1431655765
        .long   -1431655765
        .long   -1431655765
        .long   -1431655765
        .long   -1431655765
The first vpsrlq, and the extra register it keeps live across the loop,
aren't needed: if the two elements of each even/odd pair in the constant
vector are equal, we can just use .LC0(%rip) (i.e. %ymm2 above) in both
places.
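To make the reasoning concrete, here is a small standalone sketch (for
illustration only, not part of the patch; the input values are made up) of
how I read the vpmuludq/vpsrlq sequence above, and of why .LC0 holds
-1431655765 (0xAAAAAAAB == ceil(2^33/3), the usual multiply-high reciprocal
for the division by 3) in the first place:

#include <assert.h>
#include <stdint.h>

#define MAGIC 0xAAAAAAABu	/* .LC0 element, -1431655765 as unsigned.  */

int
main (void)
{
  /* One 64-bit lane: b[2*i] in the low half, b[2*i+1] in the high half.
     vpmuludq only reads the even (low) 32-bit element of each lane,
     and vpsrlq $32 moves the odd element down into that position.  */
  uint64_t lane = ((uint64_t) 987654321u << 32) | 123456789u;
  uint64_t clane = ((uint64_t) MAGIC << 32) | MAGIC;

  /* Even product: vpmuludq b(%rax), %ymm2.  */
  uint64_t even = (uint32_t) lane * (uint64_t) (uint32_t) clane;

  /* Odd product as currently emitted: both operands pre-shifted by 32.  */
  uint64_t odd_shifted
    = (uint32_t) (lane >> 32) * (uint64_t) (uint32_t) (clane >> 32);

  /* Odd product after the patch: the unshifted constant is reused,
     which is valid exactly because both halves of clane are equal.  */
  uint64_t odd_unshifted
    = (uint32_t) (lane >> 32) * (uint64_t) (uint32_t) clane;

  assert (odd_shifted == odd_unshifted);

  /* Why the constant is there at all: x * MAGIC shifted right by 33
     is x / 3 for any 32-bit unsigned x.  */
  assert ((even >> 33) == (uint32_t) lane / 3);
  return 0;
}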

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux.

Apparently ix86_expand_binop_builtin wasn't prepared for NULL predicates
(but generic code is); alternatively, perhaps I could add a predicate
that accepts nonimmediate_operand or CONST_VECTOR, if that is
preferable.
Also, I'm not sure whether force_reg or copy_to_mode_reg is preferable.

2013-04-26  Jakub Jelinek  <ja...@redhat.com>

        * config/i386/i386.c (ix86_expand_binop_builtin): Allow NULL
        predicate.
        (const_vector_equal_evenodd_p): New function.
        (ix86_expand_mul_widen_evenodd): Force op1 resp. op2 into register
        if they aren't nonimmediate operands.  If their original values
        satisfy const_vector_equal_evenodd_p, don't shift them.
        * config/i386/sse.md (mul<mode>3): Remove predicates.  For the
        SSE4.1 case force operands[{1,2}] into registers if not
        nonimmediate_operand.
        (vec_widen_smult_even_v4si): Use nonimmediate_operand predicates
        instead of register_operand.
        (vec_widen_<s>mult_odd_<mode>): Remove predicates.

--- gcc/config/i386/i386.c.jj   2013-04-26 15:11:37.000000000 +0200
+++ gcc/config/i386/i386.c      2013-04-26 19:03:54.777293448 +0200
@@ -30149,9 +30150,11 @@ ix86_expand_binop_builtin (enum insn_cod
       op1 = gen_lowpart (TImode, x);
     }
 
-  if (!insn_data[icode].operand[1].predicate (op0, mode0))
+  if (insn_data[icode].operand[1].predicate
+      && !insn_data[icode].operand[1].predicate (op0, mode0))
     op0 = copy_to_mode_reg (mode0, op0);
-  if (!insn_data[icode].operand[2].predicate (op1, mode1))
+  if (insn_data[icode].operand[2].predicate
+      && !insn_data[icode].operand[2].predicate (op1, mode1))
     op1 = copy_to_mode_reg (mode1, op1);
 
   pat = GEN_FCN (icode) (target, op0, op1);
@@ -40826,6 +40829,24 @@ ix86_expand_vecop_qihi (enum rtx_code co
                       gen_rtx_fmt_ee (code, qimode, op1, op2));
 }
 
+/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
+   if op is CONST_VECTOR with all odd elements equal to their
+   preceding element.  */
+
+static bool
+const_vector_equal_evenodd_p (rtx op)
+{
+  enum machine_mode mode = GET_MODE (op);
+  int i, nunits = GET_MODE_NUNITS (mode);
+  if (GET_CODE (op) != CONST_VECTOR
+      || nunits != CONST_VECTOR_NUNITS (op))
+    return false;
+  for (i = 0; i < nunits; i += 2)
+    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
+      return false;
+  return true;
+}
+
 void
 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
                               bool uns_p, bool odd_p)
@@ -40833,6 +40854,12 @@ ix86_expand_mul_widen_evenodd (rtx dest,
   enum machine_mode mode = GET_MODE (op1);
   enum machine_mode wmode = GET_MODE (dest);
   rtx x;
+  rtx orig_op1 = op1, orig_op2 = op2;
+
+  if (!nonimmediate_operand (op1, mode))
+    op1 = force_reg (mode, op1);
+  if (!nonimmediate_operand (op2, mode))
+    op2 = force_reg (mode, op2);
 
   /* We only play even/odd games with vectors of SImode.  */
   gcc_assert (mode == V4SImode || mode == V8SImode);
@@ -40849,10 +40876,12 @@ ix86_expand_mul_widen_evenodd (rtx dest,
        }
 
       x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
-      op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
-                         x, NULL, 1, OPTAB_DIRECT);
-      op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
-                         x, NULL, 1, OPTAB_DIRECT);
+      if (!const_vector_equal_evenodd_p (orig_op1))
+       op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
+                           x, NULL, 1, OPTAB_DIRECT);
+      if (!const_vector_equal_evenodd_p (orig_op2))
+       op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
+                           x, NULL, 1, OPTAB_DIRECT);
       op1 = gen_lowpart (mode, op1);
       op2 = gen_lowpart (mode, op2);
     }
--- gcc/config/i386/sse.md.jj   2013-04-26 15:11:37.000000000 +0200
+++ gcc/config/i386/sse.md      2013-04-26 18:59:03.838753277 +0200
@@ -5631,14 +5631,16 @@ (define_insn "*sse2_pmaddwd"
 (define_expand "mul<mode>3"
   [(set (match_operand:VI4_AVX2 0 "register_operand")
        (mult:VI4_AVX2
-         (match_operand:VI4_AVX2 1 "nonimmediate_operand")
-         (match_operand:VI4_AVX2 2 "nonimmediate_operand")))]
+         (match_operand:VI4_AVX2 1)
+         (match_operand:VI4_AVX2 2)))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1)
     {
-      if (CONSTANT_P (operands[2]))
-       operands[2] = force_const_mem (<MODE>mode, operands[2]);
+      if (!nonimmediate_operand (operands[1], <MODE>mode))
+       operands[1] = force_reg (<MODE>mode, operands[1]);
+      if (!nonimmediate_operand (operands[2], <MODE>mode))
+       operands[2] = force_reg (<MODE>mode, operands[2]);
       ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
     }
   else
@@ -5702,8 +5704,8 @@ (define_expand "vec_widen_<s>mult_lo_<mo
 ;; named patterns, but signed V4SI needs special help for plain SSE2.
 (define_expand "vec_widen_smult_even_v4si"
   [(match_operand:V2DI 0 "register_operand")
-   (match_operand:V4SI 1 "register_operand")
-   (match_operand:V4SI 2 "register_operand")]
+   (match_operand:V4SI 1 "nonimmediate_operand")
+   (match_operand:V4SI 2 "nonimmediate_operand")]
   "TARGET_SSE2"
 {
   ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
@@ -5714,8 +5716,8 @@ (define_expand "vec_widen_smult_even_v4s
 (define_expand "vec_widen_<s>mult_odd_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
    (any_extend:<sseunpackmode>
-     (match_operand:VI4_AVX2 1 "register_operand"))
-   (match_operand:VI4_AVX2 2 "register_operand")]
+     (match_operand:VI4_AVX2 1))
+   (match_operand:VI4_AVX2 2)]
   "TARGET_SSE2"
 {
   ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],

        Jakub
