Pat Haugen noticed we were doing stores in benchmarks where we were doing
vector reductions to extract the final float element.  So I decided to take a
look.  This code implements the vector reductions without doing stores for
float vectors, and eliminates a vector shift for double vectors.

I suspect there are more opportunities for improving vector extract and insert
with VSX.

I did a bootstrap and make check with no regressions.  Is this ok to install on
the trunk?

[gcc]
2011-03-23  Michael Meissner  <meissner@linux.vnet.ibm.com>

        PR target/48258
        * config/rs6000/vector.md (UNSPEC_REDUC): New unspec for vector
        reduction.
        (VEC_reduc): New code iterator and splitters for vector reduction.
        (VEC_reduc_name): Ditto.
        (VEC_reduc_rtx): Ditto.
        (reduc_<VEC_reduc_name>_v2df): Vector reduction expanders for VSX.
        (reduc_<VEC_reduc_name>_v4sf): Ditto.

        * config/rs6000/rs6000.c (rs6000_expand_vector_extract): Add
        support for extracting SF on VSX.

        * config/rs6000/vsx.md (vsx_xscvspdp_scalar2): New insn for
        generating xscvspdp.
        (vsx_extract_v4sf): New insn to extract SF from V4SF vector.
        (vsx_reduc_<VEC_reduc_name>_v2df): New insns and splitters for
        double add, minimum, maximum vector reduction.
        (vsx_reduc_<VEC_reduc_name>_v4sf): Ditto.
        (vsx_reduc_<VEC_reduc_name>_v2df_scalar): New combiner insn to
        optimize double vector reduction.
        (vsx_reduc_<VEC_reduc_name>_v4sf_scalar): Ditto.
        (vsx_splat_<mode>): Use input_operand instead of
        splat_input_operand for the splat input.

[gcc/testsuite]
2011-03-23  Michael Meissner  <meissner@linux.vnet.ibm.com>

        PR target/48258
        * gcc.target/powerpc/pr48258-1.c: New file.
        * gcc.target/powerpc/pr48258-2.c: Ditto.


-- 
Michael Meissner, IBM
5 Technology Place Drive, M/S 2757, Westford, MA 01886-3141, USA
meiss...@linux.vnet.ibm.com     fax +1 (978) 399-6899
Index: gcc/config/rs6000/vector.md
===================================================================
--- gcc/config/rs6000/vector.md (revision 171306)
+++ gcc/config/rs6000/vector.md (working copy)
@@ -74,7 +74,19 @@ (define_mode_attr VEC_INT [(V4SF  "V4SI"
                           (V2DF  "V2DI")])
 
 ;; constants for unspec
-(define_c_enum "unspec" [UNSPEC_PREDICATE])
+(define_c_enum "unspec" [UNSPEC_PREDICATE
+                        UNSPEC_REDUC])
+
+;; Vector reduction code iterators
+(define_code_iterator VEC_reduc [plus smin smax])
+
+(define_code_attr VEC_reduc_name [(plus "splus")
+                                 (smin "smin")
+                                 (smax "smax")])
+
+(define_code_attr VEC_reduc_rtx [(plus "add")
+                                (smin "smin")
+                                (smax "smax")])
 
 
 ;; Vector move instructions.
@@ -991,6 +1003,41 @@ (define_expand "vashr<mode>3"
   "TARGET_ALTIVEC"
   "")
 
+;; Vector reduction expanders for VSX
+
+(define_expand "reduc_<VEC_reduc_name>_v2df"
+  [(parallel [(set (match_operand:V2DF 0 "vfloat_operand" "")
+                  (VEC_reduc:V2DF
+                   (vec_concat:V2DF
+                    (vec_select:DF
+                     (match_operand:V2DF 1 "vfloat_operand" "")
+                     (parallel [(const_int 1)]))
+                    (vec_select:DF
+                     (match_dup 1)
+                     (parallel [(const_int 0)])))
+                   (match_dup 1)))
+             (clobber (match_scratch:V2DF 2 ""))])]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "")
+
+; The (VEC_reduc:V4SF
+;      (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+;      (op1))
+;
+; is to allow us to use a code iterator, but not completely list all of the
+; vector rotates, etc. to prevent canonicalization
+
+(define_expand "reduc_<VEC_reduc_name>_v4sf"
+  [(parallel [(set (match_operand:V4SF 0 "vfloat_operand" "")
+                  (VEC_reduc:V4SF
+                   (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+                   (match_operand:V4SF 1 "vfloat_operand" "")))
+             (clobber (match_scratch:V4SF 2 ""))
+             (clobber (match_scratch:V4SF 3 ""))])]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "")
+
+
 ;;; Expanders for vector insn patterns shared between the SPE and TARGET_PAIRED systems.
 
 (define_expand "absv2sf2"
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  (revision 171306)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -5492,12 +5492,22 @@ rs6000_expand_vector_extract (rtx target
   enum machine_mode inner_mode = GET_MODE_INNER (mode);
   rtx mem;
 
-  if (VECTOR_MEM_VSX_P (mode) && (mode == V2DFmode || mode == V2DImode))
+  if (VECTOR_MEM_VSX_P (mode))
     {
-      rtx (*extract_func) (rtx, rtx, rtx)
-       = ((mode == V2DFmode) ? gen_vsx_extract_v2df : gen_vsx_extract_v2di);
-      emit_insn (extract_func (target, vec, GEN_INT (elt)));
-      return;
+      switch (mode)
+       {
+       default:
+         break;
+       case V2DFmode:
+         emit_insn (gen_vsx_extract_v2df (target, vec, GEN_INT (elt)));
+         return;
+       case V2DImode:
+         emit_insn (gen_vsx_extract_v2di (target, vec, GEN_INT (elt)));
+         return;
+       case V4SFmode:
+         emit_insn (gen_vsx_extract_v4sf (target, vec, GEN_INT (elt)));
+         return;
+       }
     }
 
   /* Allocate mode-sized buffer.  */
Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md    (revision 171306)
+++ gcc/config/rs6000/vsx.md    (working copy)
@@ -829,6 +829,15 @@ (define_insn "vsx_xscvdpsp_scalar"
   "xscvdpsp %x0,%x1"
   [(set_attr "type" "fp")])
 
+;; Same as vsx_xscvspdp, but use SF as the type
+(define_insn "vsx_xscvspdp_scalar2"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=f")
+       (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                  UNSPEC_VSX_CVSPDP))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
+  "xscvspdp %x0,%x1"
+  [(set_attr "type" "fp")])
+
 ;; Convert from 64-bit to 32-bit types
 ;; Note, favor the Altivec registers since the usual use of these instructions
 ;; is in vector converts and we need to use the Altivec vperm instruction.
@@ -1039,6 +1048,43 @@ (define_insn "*vsx_extract_<mode>_zero"
   [(set_attr "type" "fpload")
    (set_attr "length" "4")])  
 
+;; Extract a SF element from V4SF
+(define_insn_and_split "vsx_extract_v4sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=f,f")
+       (vec_select:SF
+        (match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
+        (parallel [(match_operand:QI 2 "u5bit_cint_operand" "O,i")])))
+   (clobber (match_scratch:V4SF 3 "=X,0"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "@
+   xscvspdp %x0,%x1
+   #"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx tmp;
+  HOST_WIDE_INT ele = INTVAL (op2);
+
+  if (ele == 0)
+    tmp = op1;
+  else
+    {
+      if (GET_CODE (op3) == SCRATCH)
+       op3 = gen_reg_rtx (V4SFmode);
+      emit_insn (gen_vsx_xxsldwi_v4sf (op3, op1, op1, op2));
+      tmp = op3;
+    }
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp));
+  DONE;
+}"
+  [(set_attr "length" "4,8")
+   (set_attr "type" "fp")])
+
 ;; General double word oriented permute, allow the other vector types for
 ;; optimizing the permute instruction.
 (define_insn "vsx_xxpermdi_<mode>"
@@ -1076,7 +1122,7 @@ (define_insn "*vsx_xxpermdi2_<mode>"
 (define_insn "vsx_splat_<mode>"
   [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wd,wd,wd,?wa,?wa,?wa")
        (vec_duplicate:VSX_D
-        (match_operand:<VS_scalar> 1 "splat_input_operand" "ws,f,Z,wa,wa,Z")))]
+        (match_operand:<VS_scalar> 1 "input_operand" "ws,f,Z,wa,wa,Z")))]
   "VECTOR_MEM_VSX_P (<MODE>mode)"
   "@
    xxpermdi %x0,%x1,%x1,0
@@ -1150,3 +1196,153 @@ (define_insn "vsx_xxsldwi_<mode>"
   "VECTOR_MEM_VSX_P (<MODE>mode)"
   "xxsldwi %x0,%x1,%x2,%3"
   [(set_attr "type" "vecperm")])
+
+
+;; Vector reduction insns and splitters
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df"
+  [(set (match_operand:V2DF 0 "vfloat_operand" "=&wd,&?wa,wd,?wa")
+       (VEC_reduc:V2DF
+        (vec_concat:V2DF
+         (vec_select:DF
+          (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
+          (parallel [(const_int 1)]))
+         (vec_select:DF
+          (match_dup 1)
+          (parallel [(const_int 0)])))
+        (match_dup 1)))
+   (clobber (match_scratch:V2DF 2 "=0,0,&wd,&wa"))]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx tmp = (GET_CODE (operands[2]) == SCRATCH)
+            ? gen_reg_rtx (V2DFmode)
+            : operands[2];
+  emit_insn (gen_vsx_xxsldwi_v2df (tmp, operands[1], operands[1], const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v2df3 (operands[0], tmp, operands[1]));
+  DONE;
+}"
+  [(set_attr "length" "8")
+   (set_attr "type" "veccomplex")])
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf"
+  [(set (match_operand:V4SF 0 "vfloat_operand" "=wf,?wa")
+       (VEC_reduc:V4SF
+        (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+        (match_operand:V4SF 1 "vfloat_operand" "wf,wa")))
+   (clobber (match_scratch:V4SF 2 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 3 "=&wf,&wa"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp2, tmp3, tmp4;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp2 = gen_reg_rtx (V4SFmode);
+      tmp3 = gen_reg_rtx (V4SFmode);
+      tmp4 = gen_reg_rtx (V4SFmode);
+    }
+  else
+    {
+      tmp2 = operands[2];
+      tmp3 = operands[3];
+      tmp4 = tmp2;
+    }
+
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (op0, tmp4, tmp3));
+  DONE;
+}"
+  [(set_attr "length" "16")
+   (set_attr "type" "veccomplex")])
+
+;; Combiner patterns with the vector reduction patterns that knows we can get
+;; to the top element of the V2DF array without doing an extract.
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df_scalar"
+  [(set (match_operand:DF 0 "vfloat_operand" "=&ws,&?wa,ws,?wa")
+       (vec_select:DF
+        (VEC_reduc:V2DF
+         (vec_concat:V2DF
+          (vec_select:DF
+           (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
+           (parallel [(const_int 1)]))
+          (vec_select:DF
+           (match_dup 1)
+           (parallel [(const_int 0)])))
+         (match_dup 1))
+        (parallel [(const_int 1)])))
+   (clobber (match_scratch:DF 2 "=0,0,&wd,&wa"))]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx hi = gen_highpart (DFmode, operands[1]);
+  rtx lo = (GET_CODE (operands[2]) == SCRATCH)
+           ? gen_reg_rtx (DFmode)
+           : operands[2];
+
+  emit_insn (gen_vsx_extract_v2df (lo, operands[1], const1_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>df3 (operands[0], hi, lo));
+  DONE;
+}"
+  [(set_attr "length" "8")
+   (set_attr "type" "veccomplex")])
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf_scalar"
+  [(set (match_operand:SF 0 "vfloat_operand" "=f,?f")
+       (vec_select:SF
+        (VEC_reduc:V4SF
+         (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+         (match_operand:V4SF 1 "vfloat_operand" "wf,wa"))
+        (parallel [(const_int 3)])))
+   (clobber (match_scratch:V4SF 2 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 3 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 4 "=0,0"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp2, tmp3, tmp4, tmp5;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp2 = gen_reg_rtx (V4SFmode);
+      tmp3 = gen_reg_rtx (V4SFmode);
+      tmp4 = gen_reg_rtx (V4SFmode);
+      tmp5 = gen_reg_rtx (V4SFmode);
+    }
+  else
+    {
+      tmp2 = operands[2];
+      tmp3 = operands[3];
+      tmp4 = tmp2;
+      tmp5 = operands[4];
+    }
+
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp5, tmp4, tmp3));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp5));
+  DONE;
+}"
+  [(set_attr "length" "20")
+   (set_attr "type" "veccomplex")])

Reply via email to