On Fri, Feb 03, 2017 at 06:07:56PM -0600, Segher Boessenkool wrote:
> On Fri, Feb 03, 2017 at 04:25:00PM -0500, Michael Meissner wrote:
> > +;; Return 1 if operand is either a vector constant of all 0 bits of a 
> > vector
> > +;; constant of all 1 bits.
> > +(define_predicate "vector_int_same_bit"
> > +  (match_code "const_vector")
> > +{
> > +  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> > +    return 0;
> > +
> > +  else
> > +    return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
> > +})
> 
> This predicate is unused as far as I see?

I removed it.  Thanks.

> > +  /* Optimize vec1 == vec2, to know the mask generates -1/0.  */
> > +  if (GET_MODE_CLASS (dest_mode) == MODE_VECTOR_INT)
> >      {
> > -      tmp = op_true;
> > -      op_true = op_false;
> > -      op_false = tmp;
> > +      if (op_true == constant_m1 && op_false == constant_0)
> > +   {
> > +     emit_move_insn (dest, mask);
> > +     return 1;
> > +   }
> > +
> > +      else if (op_true == constant_0 && op_false == constant_m1)
> > +   {
> > +     emit_insn (gen_rtx_SET (dest, gen_rtx_NOT (dest_mode, mask)));
> > +     return 1;
> > +   }
> >      }
> 
> Do you need to test for dest_mode == mask_mode here, like below?

I reworked the test, see below.

> > +  if (op_true == constant_m1 && dest_mode == mask_mode)
> > +    op_true = mask;
> > +  else if (!REG_P (op_true) && !SUBREG_P (op_true))
> > +    op_true = force_reg (dest_mode, op_true);
> > +
> > +  if (op_false == constant_0 && dest_mode == mask_mode)
> > +    op_false = mask;
> > +  else if (!REG_P (op_false) && !SUBREG_P (op_false))
> > +    op_false = force_reg (dest_mode, op_false);
> 
> Another thing you could try is, if either op_true or op_false is 0
> or -1, let the result be
>       (mask & op_true) | (~mask & op_false)
> 
> and let the rest of the optimisers sort it out (it's a single vor/vand
> or vorc/vandc, or a vnot, or nothing).  A later improvement perhaps.
> Or does it already handle all cases now :-)

That will be a later optimization if desired.  I reran the tests with no
regressions.

When I reran the spec benchmark in more controlled conditions, I wasn't able to
reproduce the gcc speedups like I saw with a quick run.  I did see the tonto
speedups with these changes.

As I said, but perhaps I wasn't clear.  I will be in the office on Tuesday, but
I will be on vacation starting on Wednesday.  I would prefer checking in the
changes today (Monday) just in case something needs fixing before I leave.  Can
I check it into the trunk?

[gcc]
2017-02-06  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        PR target/66144
        * config/rs6000/vector.md (vcond<mode><mode>): Allow the true and
        false values to be constant vectors with all 0 or all 1 bits set.
        (vcondu<mode><mode>): Likewise.
        * config/rs6000/predicates.md (vector_int_reg_or_same_bit): New
        predicate.
        (fpmask_comparison_operator): Update comment.
        (vecint_comparison_operator): New predicate.
        * config/rs6000/rs6000.c (rs6000_emit_vector_cond_expr): Optimize
        vector conditionals when the true and false values are constant
        vectors with all 0 bits or all 1 bits set.

[gcc/testsuite]
2017-02-06  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        PR target/66144
        * gcc.target/powerpc/pr66144-1.c: New test.
        * gcc.target/powerpc/pr66144-2.c: Likewise.
        * gcc.target/powerpc/pr66144-3.c: Likewise.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/predicates.md
===================================================================
--- gcc/config/rs6000/predicates.md     (revision 245137)
+++ gcc/config/rs6000/predicates.md     (working copy)
@@ -808,6 +808,21 @@ (define_predicate "all_ones_constant"
   (and (match_code "const_int,const_double,const_wide_int,const_vector")
        (match_test "op == CONSTM1_RTX (mode) && !FLOAT_MODE_P (mode)")))
 
+;; Return 1 if operand is a vector int register or is either a vector constant
+;; of all 0 bits of a vector constant of all 1 bits.
+(define_predicate "vector_int_reg_or_same_bit"
+  (match_code "reg,subreg,const_vector")
+{
+  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
+    return 0;
+
+  else if (REG_P (op) || SUBREG_P (op))
+    return vint_operand (op, mode);
+
+  else
+    return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
+})
+
 ;; Return 1 if operand is 0.0.
 (define_predicate "zero_fp_constant"
   (and (match_code "const_double")
@@ -1260,8 +1275,8 @@ (define_predicate "scc_rev_comparison_op
   (and (match_operand 0 "branch_comparison_operator")
        (match_code "ne,le,ge,leu,geu,ordered")))
 
-;; Return 1 if OP is a comparison operator suitable for vector/scalar
-;; comparisons that generate a -1/0 mask.
+;; Return 1 if OP is a comparison operator suitable for floating point
+;; vector/scalar comparisons that generate a -1/0 mask.
 (define_predicate "fpmask_comparison_operator"
   (match_code "eq,gt,ge"))
 
@@ -1271,6 +1286,11 @@ (define_predicate "fpmask_comparison_ope
 (define_predicate "invert_fpmask_comparison_operator"
   (match_code "ne,unlt,unle"))
 
+;; Return 1 if OP is a comparison operation suitable for integer vector/scalar
+;; comparisons that generate a -1/0 mask.
+(define_predicate "vecint_comparison_operator"
+  (match_code "eq,gt,gtu"))
+
 ;; Return 1 if OP is a comparison operation that is valid for a branch
 ;; insn, which is true if the corresponding bit in the CC register is set.
 (define_predicate "branch_positive_comparison_operator"
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  (revision 245137)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -25122,7 +25122,6 @@ rs6000_emit_vector_cond_expr (rtx dest,
   machine_mode cc_mode = CCmode;
   rtx mask;
   rtx cond2;
-  rtx tmp;
   bool invert_move = false;
 
   if (VECTOR_UNIT_NONE_P (dest_mode))
@@ -25167,12 +25166,44 @@ rs6000_emit_vector_cond_expr (rtx dest,
     return 0;
 
   if (invert_move)
+    std::swap (op_true, op_false);
+
+  /* Optimize vec1 == vec2, to know the mask generates -1/0.  */
+  if (GET_MODE_CLASS (dest_mode) == MODE_VECTOR_INT
+      && (GET_CODE (op_true) == CONST_VECTOR
+         || GET_CODE (op_false) == CONST_VECTOR))
     {
-      tmp = op_true;
-      op_true = op_false;
-      op_false = tmp;
+      rtx constant_0 = CONST0_RTX (dest_mode);
+      rtx constant_m1 = CONSTM1_RTX (dest_mode);
+
+      if (op_true == constant_m1 && op_false == constant_0)
+       {
+         emit_move_insn (dest, mask);
+         return 1;
+       }
+
+      else if (op_true == constant_0 && op_false == constant_m1)
+       {
+         emit_insn (gen_rtx_SET (dest, gen_rtx_NOT (dest_mode, mask)));
+         return 1;
+       }
+
+      /* If we can't use the vector comparison directly, perhaps we can use
+        the mask for the true or false fields, instead of loading up a
+        constant.  */
+      if (op_true == constant_m1)
+       op_true = mask;
+
+      if (op_false == constant_0)
+       op_false = mask;
     }
 
+  if (!REG_P (op_true) && !SUBREG_P (op_true))
+    op_true = force_reg (dest_mode, op_true);
+
+  if (!REG_P (op_false) && !SUBREG_P (op_false))
+    op_false = force_reg (dest_mode, op_false);
+
   cond2 = gen_rtx_fmt_ee (NE, cc_mode, gen_lowpart (dest_mode, mask),
                          CONST0_RTX (dest_mode));
   emit_insn (gen_rtx_SET (dest,
Index: gcc/config/rs6000/vector.md
===================================================================
--- gcc/config/rs6000/vector.md (revision 245137)
+++ gcc/config/rs6000/vector.md (working copy)
@@ -390,13 +390,13 @@ (define_expand "vcond<mode><mode>"
 }")
 
 (define_expand "vcond<mode><mode>"
-  [(set (match_operand:VEC_I 0 "vint_operand" "")
+  [(set (match_operand:VEC_I 0 "vint_operand")
        (if_then_else:VEC_I
         (match_operator 3 "comparison_operator"
-                        [(match_operand:VEC_I 4 "vint_operand" "")
-                         (match_operand:VEC_I 5 "vint_operand" "")])
-        (match_operand:VEC_I 1 "vint_operand" "")
-        (match_operand:VEC_I 2 "vint_operand" "")))]
+                        [(match_operand:VEC_I 4 "vint_operand")
+                         (match_operand:VEC_I 5 "vint_operand")])
+        (match_operand:VEC_I 1 "vector_int_reg_or_same_bit")
+        (match_operand:VEC_I 2 "vector_int_reg_or_same_bit")))]
   "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)"
   "
 {
@@ -446,13 +446,13 @@ (define_expand "vcondv4siv4sf"
 }")
 
 (define_expand "vcondu<mode><mode>"
-  [(set (match_operand:VEC_I 0 "vint_operand" "")
+  [(set (match_operand:VEC_I 0 "vint_operand")
        (if_then_else:VEC_I
         (match_operator 3 "comparison_operator"
-                        [(match_operand:VEC_I 4 "vint_operand" "")
-                         (match_operand:VEC_I 5 "vint_operand" "")])
-        (match_operand:VEC_I 1 "vint_operand" "")
-        (match_operand:VEC_I 2 "vint_operand" "")))]
+                        [(match_operand:VEC_I 4 "vint_operand")
+                         (match_operand:VEC_I 5 "vint_operand")])
+        (match_operand:VEC_I 1 "vector_int_reg_or_same_bit")
+        (match_operand:VEC_I 2 "vector_int_reg_or_same_bit")))]
   "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)"
   "
 {
Index: gcc/testsuite/gcc.target/powerpc/pr66144-1.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr66144-1.c        (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/pr66144-1.c        (working copy)
@@ -0,0 +1,20 @@
+/* { dg-do compile { target { powerpc64*-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
"-mcpu=power9" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Verify that we optimize vector1 = (vector2 == vector3) by not loading up
+   0/-1.  */
+
+vector int
+test (vector int a, vector int b)
+{
+  return a == b;
+}
+
+/* { dg-final { scan-assembler     {\mvcmpequw\M} } } */
+/* { dg-final { scan-assembler-not {\mxxspltib\M} } } */
+/* { dg-final { scan-assembler-not {\mvspltisw\M} } } */
+/* { dg-final { scan-assembler-not {\mxxlxor\M}   } } */
+/* { dg-final { scan-assembler-not {\mxxlxorc\M}  } } */
+/* { dg-final { scan-assembler-not {\mxxsel\M}    } } */
Index: gcc/testsuite/gcc.target/powerpc/pr66144-2.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr66144-2.c        (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/pr66144-2.c        (working copy)
@@ -0,0 +1,21 @@
+/* { dg-do compile { target { powerpc64*-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
"-mcpu=power8" } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mcpu=power8 -O2" } */
+
+/* Verify that we optimize vector1 = (vector2 != vector3) by not loading up
+   0/-1.  */
+
+vector unsigned char
+test (vector unsigned char a, vector unsigned char b)
+{
+  return a != b;
+}
+
+/* { dg-final { scan-assembler     {\mvcmpequb\M} } } */
+/* { dg-final { scan-assembler     {\mxxlnor\M}   } } */
+/* { dg-final { scan-assembler-not {\mxxspltib\M} } } */
+/* { dg-final { scan-assembler-not {\mvspltisw\M} } } */
+/* { dg-final { scan-assembler-not {\mxxlxor\M}   } } */
+/* { dg-final { scan-assembler-not {\mxxlxorc\M}  } } */
+/* { dg-final { scan-assembler-not {\mxxsel\M}    } } */
Index: gcc/testsuite/gcc.target/powerpc/pr66144-3.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr66144-3.c        (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/pr66144-3.c        (working copy)
@@ -0,0 +1,27 @@
+/* { dg-do compile { target { powerpc64*-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
"-mcpu=power8" } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mcpu=power8 -O2 -ftree-vectorize" } */
+
+/* Verify that we can optimize a vector conditional move, where one of the arms
+   is all 1's into using the mask as one of the inputs to XXSEL.  */
+
+#include <altivec.h>
+
+static int a[1024], b[1024], c[1024];
+
+int *p_a = a, *p_b = b, *p_c = c;
+
+void
+test (void)
+{
+  unsigned long i;
+
+  for (i = 0; i < 1024; i++)
+    a[i] = (b[i] == c[i]) ? -1 : a[i];
+}
+
+/* { dg-final { scan-assembler     {\mvcmpequw\M} } } */
+/* { dg-final { scan-assembler     {\mxxsel\M}    } } */
+/* { dg-final { scan-assembler-not {\mvspltisw\M} } } */
+/* { dg-final { scan-assembler-not {\mxxlorc\M}   } } */

Reply via email to