Pat Haugen noticed that in some benchmarks we were doing stores just to extract the final float element of a vector reduction, so I decided to take a look.  This patch implements the float vector reductions without doing stores, and also eliminates a vector shift for double vectors.
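To make the win concrete, here is a sketch of the kind of loop involved (my own illustration, not the actual pr48258 testcases); when the vectorizer picks up the new reduc_splus_v4sf expander, the final scalar sum should come out of the vector register directly instead of going through memory:

    /* Hypothetical float reduction loop; with VSX and vectorization
       enabled, the loop body becomes V4SF adds and the final sum is a
       reduc_splus_v4sf followed by an element extract.  */
    float
    sum_floats (const float *a, unsigned long n)
    {
      float total = 0.0f;
      unsigned long i;

      for (i = 0; i < n; i++)
        total += a[i];

      return total;
    }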
I suspect there are more opportunities for improving vector extract and insert with VSX.  I did a bootstrap and make check with no regressions.  Is this ok to install on the trunk?

[gcc]
2011-03-23  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        PR target/48258
        * config/rs6000/vector.md (UNSPEC_REDUC): New unspec for vector
        reduction.
        (VEC_reduc): New code iterator and splitters for vector
        reduction.
        (VEC_reduc_name): Ditto.
        (VEC_reduc_rtx): Ditto.
        (reduc_<VEC_reduc_name>_v2df): Vector reduction expanders for
        VSX.
        (reduc_<VEC_reduc_name>_v4sf): Ditto.

        * config/rs6000/rs6000.c (rs6000_expand_vector_extract): Add
        support for extracting SF on VSX.

        * config/rs6000/vsx.md (vsx_xscvspdp_scalar2): New insn for
        generating xscvspdp.
        (vsx_extract_v4sf): New insn to extract SF from V4SF vector.
        (vsx_reduc_<VEC_reduc_name>_v2df): New insns and splitters for
        double add, minimum, maximum vector reduction.
        (vsx_reduc_<VEC_reduc_name>_v4sf): Ditto.
        (vsx_reduc_<VEC_reduc_name>_v2df_scalar): New combiner insn to
        optimize double vector reduction.
        (vsx_reduc_<VEC_reduc_name>_v4sf_scalar): Ditto.

[gcc/testsuite]
2011-03-23  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        PR target/48258
        * gcc.target/powerpc/pr48258-1.c: New file.
        * gcc.target/powerpc/pr48258-2.c: Ditto.

-- 
Michael Meissner, IBM
5 Technology Place Drive, M/S 2757, Westford, MA 01886-3141, USA
meiss...@linux.vnet.ibm.com     fax +1 (978) 399-6899
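To help review of the V4SF reduction splitters in the patch below, here is a scalar model (my own illustration, not part of the patch) of the sequence they emit: two xxsldwi word rotations interleaved with two vector ops, after which every lane holds the full reduction, so any single element can be converted to scalar with xscvspdp:

    #include <stdio.h>

    /* Scalar model of xxsldwi vt,va,va,N with both inputs equal:
       word i of the result is word (i + N) mod 4 of the input.  */
    static void
    xxsldwi_model (float vt[4], const float va[4], int n)
    {
      int i;
      for (i = 0; i < 4; i++)
        vt[i] = va[(i + n) & 3];
    }

    int
    main (void)
    {
      float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
      float tmp2[4], tmp3[4], tmp4[4], result[4];
      int i;

      /* tmp2 = xxsldwi (v, v, 2); tmp3 = tmp2 + v  */
      xxsldwi_model (tmp2, v, 2);
      for (i = 0; i < 4; i++)
        tmp3[i] = tmp2[i] + v[i];

      /* tmp4 = xxsldwi (tmp3, tmp3, 3); result = tmp4 + tmp3  */
      xxsldwi_model (tmp4, tmp3, 3);
      for (i = 0; i < 4; i++)
        result[i] = tmp4[i] + tmp3[i];

      /* Every lane now holds the full sum (10.0), so the combiner
         pattern can convert one lane to scalar with xscvspdp instead
         of storing and reloading.  */
      printf ("%f\n", (double) result[0]);
      return 0;
    }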
Index: gcc/config/rs6000/vector.md
===================================================================
--- gcc/config/rs6000/vector.md	(revision 171306)
+++ gcc/config/rs6000/vector.md	(working copy)
@@ -74,7 +74,19 @@ (define_mode_attr VEC_INT [(V4SF "V4SI"
 			   (V2DF "V2DI")])
 
 ;; constants for unspec
-(define_c_enum "unspec" [UNSPEC_PREDICATE])
+(define_c_enum "unspec" [UNSPEC_PREDICATE
+			 UNSPEC_REDUC])
+
+;; Vector reduction code iterators
+(define_code_iterator VEC_reduc [plus smin smax])
+
+(define_code_attr VEC_reduc_name [(plus "splus")
+				  (smin "smin")
+				  (smax "smax")])
+
+(define_code_attr VEC_reduc_rtx [(plus "add")
+				 (smin "smin")
+				 (smax "smax")])
 
 ;; Vector move instructions.
@@ -991,6 +1003,41 @@ (define_expand "vashr<mode>3"
   "TARGET_ALTIVEC"
   "")
 
+;; Vector reduction expanders for VSX
+
+(define_expand "reduc_<VEC_reduc_name>_v2df"
+  [(parallel [(set (match_operand:V2DF 0 "vfloat_operand" "")
+		   (VEC_reduc:V2DF
+		    (vec_concat:V2DF
+		     (vec_select:DF
+		      (match_operand:V2DF 1 "vfloat_operand" "")
+		      (parallel [(const_int 1)]))
+		     (vec_select:DF
+		      (match_dup 1)
+		      (parallel [(const_int 0)])))
+		    (match_dup 1)))
+	      (clobber (match_scratch:V2DF 2 ""))])]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "")
+
+; The (VEC_reduc:V4SF
+;	(op1)
+;	(unspec:V4SF [(const_int 0)] UNSPEC_REDUC))
+;
+; is to allow us to use a code iterator, but not completely list all of the
+; vector rotates, etc. to prevent canonicalization
+
+(define_expand "reduc_<VEC_reduc_name>_v4sf"
+  [(parallel [(set (match_operand:V4SF 0 "vfloat_operand" "")
+		   (VEC_reduc:V4SF
+		    (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+		    (match_operand:V4SF 1 "vfloat_operand" "")))
+	      (clobber (match_scratch:V4SF 2 ""))
+	      (clobber (match_scratch:V4SF 3 ""))])]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "")
+
+
 ;;; Expanders for vector insn patterns shared between the SPE and TARGET_PAIRED systems.
 
 (define_expand "absv2sf2"
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 171306)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -5492,12 +5492,22 @@ rs6000_expand_vector_extract (rtx target
   enum machine_mode inner_mode = GET_MODE_INNER (mode);
   rtx mem;
 
-  if (VECTOR_MEM_VSX_P (mode) && (mode == V2DFmode || mode == V2DImode))
+  if (VECTOR_MEM_VSX_P (mode))
     {
-      rtx (*extract_func) (rtx, rtx, rtx)
-	= ((mode == V2DFmode) ? gen_vsx_extract_v2df : gen_vsx_extract_v2di);
-      emit_insn (extract_func (target, vec, GEN_INT (elt)));
-      return;
+      switch (mode)
+	{
+	default:
+	  break;
+	case V2DFmode:
+	  emit_insn (gen_vsx_extract_v2df (target, vec, GEN_INT (elt)));
+	  return;
+	case V2DImode:
+	  emit_insn (gen_vsx_extract_v2di (target, vec, GEN_INT (elt)));
+	  return;
+	case V4SFmode:
+	  emit_insn (gen_vsx_extract_v4sf (target, vec, GEN_INT (elt)));
+	  return;
+	}
     }
 
   /* Allocate mode-sized buffer.  */
Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md	(revision 171306)
+++ gcc/config/rs6000/vsx.md	(working copy)
@@ -829,6 +829,15 @@ (define_insn "vsx_xscvdpsp_scalar"
   "xscvdpsp %x0,%x1"
   [(set_attr "type" "fp")])
 
+;; Same as vsx_xscvspdp, but use SF as the type
+(define_insn "vsx_xscvspdp_scalar2"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=f")
+	(unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+		   UNSPEC_VSX_CVSPDP))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
+  "xscvspdp %x0,%x1"
+  [(set_attr "type" "fp")])
+
 ;; Convert from 64-bit to 32-bit types
 ;; Note, favor the Altivec registers since the usual use of these instructions
 ;; is in vector converts and we need to use the Altivec vperm instruction.
@@ -1039,6 +1048,43 @@ (define_insn "*vsx_extract_<mode>_zero"
   [(set_attr "type" "fpload")
    (set_attr "length" "4")])
 
+;; Extract a SF element from V4SF
+(define_insn_and_split "vsx_extract_v4sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=f,f")
+	(vec_select:SF
+	 (match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
+	 (parallel [(match_operand:QI 2 "u5bit_cint_operand" "O,i")])))
+   (clobber (match_scratch:V4SF 3 "=X,0"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "@
+   xscvspdp %x0,%x1
+   #"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx tmp;
+  HOST_WIDE_INT ele = INTVAL (op2);
+
+  if (ele == 0)
+    tmp = op1;
+  else
+    {
+      if (GET_CODE (op3) == SCRATCH)
+	op3 = gen_reg_rtx (V4SFmode);
+      emit_insn (gen_vsx_xxsldwi_v4sf (op3, op1, op1, op2));
+      tmp = op3;
+    }
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp));
+  DONE;
+}"
+  [(set_attr "length" "4,8")
+   (set_attr "type" "fp")])
+
 ;; General double word oriented permute, allow the other vector types for
 ;; optimizing the permute instruction.
 (define_insn "vsx_xxpermdi_<mode>"
@@ -1076,7 +1122,7 @@ (define_insn "*vsx_xxpermdi2_<mode>"
 (define_insn "vsx_splat_<mode>"
   [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wd,wd,wd,?wa,?wa,?wa")
 	(vec_duplicate:VSX_D
-	 (match_operand:<VS_scalar> 1 "splat_input_operand" "ws,f,Z,wa,wa,Z")))]
+	 (match_operand:<VS_scalar> 1 "input_operand" "ws,f,Z,wa,wa,Z")))]
   "VECTOR_MEM_VSX_P (<MODE>mode)"
   "@
    xxpermdi %x0,%x1,%x1,0
@@ -1150,3 +1196,153 @@ (define_insn "vsx_xxsldwi_<mode>"
   "VECTOR_MEM_VSX_P (<MODE>mode)"
   "xxsldwi %x0,%x1,%x2,%3"
   [(set_attr "type" "vecperm")])
+
+
+;; Vector reduction insns and splitters
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df"
+  [(set (match_operand:V2DF 0 "vfloat_operand" "=&wd,&?wa,wd,?wa")
+	(VEC_reduc:V2DF
+	 (vec_concat:V2DF
+	  (vec_select:DF
+	   (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
+	   (parallel [(const_int 1)]))
+	  (vec_select:DF
+	   (match_dup 1)
+	   (parallel [(const_int 0)])))
+	 (match_dup 1)))
+   (clobber (match_scratch:V2DF 2 "=0,0,&wd,&wa"))]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx tmp = (GET_CODE (operands[2]) == SCRATCH)
+	     ? gen_reg_rtx (V2DFmode)
+	     : operands[2];
+  emit_insn (gen_vsx_xxsldwi_v2df (tmp, operands[1], operands[1], const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v2df3 (operands[0], tmp, operands[1]));
+  DONE;
+}"
+  [(set_attr "length" "8")
+   (set_attr "type" "veccomplex")])
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf"
+  [(set (match_operand:V4SF 0 "vfloat_operand" "=wf,?wa")
+	(VEC_reduc:V4SF
+	 (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+	 (match_operand:V4SF 1 "vfloat_operand" "wf,wa")))
+   (clobber (match_scratch:V4SF 2 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 3 "=&wf,&wa"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp2, tmp3, tmp4;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp2 = gen_reg_rtx (V4SFmode);
+      tmp3 = gen_reg_rtx (V4SFmode);
+      tmp4 = gen_reg_rtx (V4SFmode);
+    }
+  else
+    {
+      tmp2 = operands[2];
+      tmp3 = operands[3];
+      tmp4 = tmp2;
+    }
+
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (op0, tmp4, tmp3));
+  DONE;
+}"
+  [(set_attr "length" "16")
+   (set_attr "type" "veccomplex")])
+
+;; Combiner patterns with the vector reduction patterns that knows we can get
+;; to the top element of the V2DF array without doing an extract.
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df_scalar"
+  [(set (match_operand:DF 0 "vfloat_operand" "=&ws,&?wa,ws,?wa")
+	(vec_select:DF
+	 (VEC_reduc:V2DF
+	  (vec_concat:V2DF
+	   (vec_select:DF
+	    (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
+	    (parallel [(const_int 1)]))
+	   (vec_select:DF
+	    (match_dup 1)
+	    (parallel [(const_int 0)])))
+	  (match_dup 1))
+	 (parallel [(const_int 1)])))
+   (clobber (match_scratch:DF 2 "=0,0,&wd,&wa"))]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx hi = gen_highpart (DFmode, operands[1]);
+  rtx lo = (GET_CODE (operands[2]) == SCRATCH)
+	    ? gen_reg_rtx (DFmode)
+	    : operands[2];
+
+  emit_insn (gen_vsx_extract_v2df (lo, operands[1], const1_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>df3 (operands[0], hi, lo));
+  DONE;
+}"
+  [(set_attr "length" "8")
+   (set_attr "type" "veccomplex")])
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf_scalar"
+  [(set (match_operand:SF 0 "vfloat_operand" "=f,?f")
+	(vec_select:SF
+	 (VEC_reduc:V4SF
+	  (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+	  (match_operand:V4SF 1 "vfloat_operand" "wf,wa"))
+	 (parallel [(const_int 3)])))
+   (clobber (match_scratch:V4SF 2 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 3 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 4 "=0,0"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp2, tmp3, tmp4, tmp5;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp2 = gen_reg_rtx (V4SFmode);
+      tmp3 = gen_reg_rtx (V4SFmode);
+      tmp4 = gen_reg_rtx (V4SFmode);
+      tmp5 = gen_reg_rtx (V4SFmode);
+    }
+  else
+    {
+      tmp2 = operands[2];
+      tmp3 = operands[3];
+      tmp4 = tmp2;
+      tmp5 = operands[4];
+    }
+
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp5, tmp4, tmp3));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp5));
+  DONE;
+}"
+  [(set_attr "length" "20")
+   (set_attr "type" "veccomplex")])
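As an aside, for double precision the *vsx_reduc_<VEC_reduc_name>_v2df_scalar combiner above boils down to the following scalar computation (my sketch of the plus case, for illustration only):

    /* Conceptual model of the V2DF scalar reduction combiner: the high
       element is just the highpart of the vector register, the low
       element comes from vsx_extract_v2df, and they are combined as
       scalars, so no store and no vector shift of the result is needed.  */
    double
    reduce_plus_v2df_model (const double v[2])
    {
      double hi = v[0];   /* gen_highpart (DFmode, operands[1]) */
      double lo = v[1];   /* vsx_extract_v2df (..., const1_rtx) */
      return hi + lo;
    }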