The original VSX instruction set did not have a simple way to merge two vectors
whose float values sit in the upper word of each double word (the layout left
by the XVCVDPSP instructions) into V4SFmode format.  So, the code used a VPERM
instruction to reorder the parts.

/* Example test case: build a vector float from four scalar float
   arguments using a vector initializer.  */
vector float combine (float a, float b, float c, float d)
{
  return (vector float) { a, b, c, d };
}

Previously, the function above would generate:

        xxpermdi 34,1,2,0
        addis 9,2,.LC0@toc@ha
        xxpermdi 32,3,4,0
        addi 9,9,.LC0@toc@l
        lxvw4x 33,0,9
        xvcvdpsp 34,34
        xvcvdpsp 32,32
        vperm 2,2,0,1

        # ...

.LC0:
        .byte   0
        .byte   1
        .byte   2
        .byte   3
        .byte   8
        .byte   9
        .byte   10
        .byte   11
        .byte   16
        .byte   17
        .byte   18
        .byte   19
        .byte   24
        .byte   25
        .byte   26
        .byte   27

However, ISA 2.07 (i.e. power8) added the VMRGEW instruction, which can do
this more simply:

        xxpermdi 34,1,2,0
        xxpermdi 32,3,4,0
        xvcvdpsp 34,34
        xvcvdpsp 32,32
        vmrgew 2,2,0

I also built SPEC 2006 with the compiler, and four benchmarks generate the new
sequences (gromacs, dealII, hmmer, and wrf).  I ran gromacs, dealII, and hmmer,
and I didn't see any changes in execution time.

This patch adds support for using the VMRGEW instruction on ISA 2.07 and above.
I did bootstrap builds on both big-endian power8 and little-endian power8 and
there were no regressions.  Is this patch ok to check into the trunk?

2016-09-19  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        * config/rs6000/rs6000.c (rs6000_expand_vector_init): For V4SF
        inits on power8 and above, use the VMRGEW instruction instead of a
        permute.

        * config/rs6000/altivec.md (UNSPEC_VMRGEW_DIRECT): New unspec.
        (p8_vmrgew_v4sf_direct): New VMRGEW insn for V4SF floating
        initialization.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  
(.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000)    
(revision 240142)
+++ gcc/config/rs6000/rs6000.c  (.../gcc/config/rs6000) (working copy)
@@ -6821,11 +6821,26 @@ rs6000_expand_vector_init (rtx target, r
          rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2));
          rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3));
 
-         emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1));
-         emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3));
-         emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
-         emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
-         rs6000_expand_extract_even (target, flt_even, flt_odd);
+         /* Use VMRGEW if we can instead of doing a permute.  */
+         if (TARGET_P8_VECTOR)
+           {
+             emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op2));
+             emit_insn (gen_vsx_concat_v2sf (dbl_odd, op1, op3));
+             emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
+             emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
+             if (BYTES_BIG_ENDIAN)
+               emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_even, 
flt_odd));
+             else
+               emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_odd, 
flt_even));
+           }
+         else
+           {
+             emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1));
+             emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3));
+             emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
+             emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
+             rs6000_expand_extract_even (target, flt_even, flt_odd);
+           }
        }
       return;
     }
Index: gcc/config/rs6000/altivec.md
===================================================================
--- gcc/config/rs6000/altivec.md        
(.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000)    
(revision 240142)
+++ gcc/config/rs6000/altivec.md        (.../gcc/config/rs6000) (working copy)
@@ -141,6 +141,7 @@ (define_c_enum "unspec"
    UNSPEC_VMRGH_DIRECT
    UNSPEC_VMRGL_DIRECT
    UNSPEC_VSPLT_DIRECT
+   UNSPEC_VMRGEW_DIRECT
    UNSPEC_VSUMSWS_DIRECT
    UNSPEC_VADDCUQ
    UNSPEC_VADDEUQM
@@ -1340,6 +1341,15 @@ (define_insn "p8_vmrgow"
 }
   [(set_attr "type" "vecperm")])
 
+(define_insn "p8_vmrgew_v4sf_direct"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+       (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
+                     (match_operand:V4SF 2 "register_operand" "v")]
+                    UNSPEC_VMRGEW_DIRECT))]
+  "TARGET_P8_VECTOR"
+  "vmrgew %0,%1,%2"
+  [(set_attr "type" "vecperm")])
+
 (define_expand "vec_widen_umult_even_v16qi"
   [(use (match_operand:V8HI 0 "register_operand" ""))
    (use (match_operand:V16QI 1 "register_operand" ""))

Reply via email to