The original VSX instruction set did not have a simple way to merge two vectors whose float values — produced in V4SFmode format by the XVCVDPSP instructions — occupy the upper word of each doubleword. So, the code used a VPERM instruction to reorder the parts.
vector float combine (float a, float b, float c, float d) { return (vector float) { a, b, c, d }; } Would generate: xxpermdi 34,1,2,0 addis 9,2,.LC0@toc@ha xxpermdi 32,3,4,0 addi 9,9,.LC0@toc@l lxvw4x 33,0,9 xvcvdpsp 34,34 xvcvdpsp 32,32 vperm 2,2,0,1 # ... .LC0: .byte 0 .byte 1 .byte 2 .byte 3 .byte 8 .byte 9 .byte 10 .byte 11 .byte 16 .byte 17 .byte 18 .byte 19 .byte 24 .byte 25 .byte 26 .byte 27 However ISA 2.07 (i.e. power8) added the VMRGEW instruction, which can do this more simply: xxpermdi 34,1,2,0 xxpermdi 32,3,4,0 xvcvdpsp 34,34 xvcvdpsp 32,32 vmrgew 2,2,0 I also built Spec 2006 with the compiler, and 4 benchmarks generate the new sequences (gromacs, dealII, hmmer, and wrf). I tested gromacs, dealII, hmmer and I didn't see any changes in execution time. This patch adds support to use the VMRGEW instruction on ISA 2.07 and above. I did bootstrap builds on both big endian power8 and little endian power8 and there were no regressions. Is this patch ok to check into the trunk? 2016-09-19 Michael Meissner <meiss...@linux.vnet.ibm.com> * config/rs6000/rs6000.c (rs6000_expand_vector_init): For V4SF inits on power8 and above, use the VMRGEW instruction instead of a permute. * config/rs6000/altivec.md (UNSPEC_VMRGEW_DIRECT): New unspec. (p8_vmrgew_v4sf_direct): New VMRGEW insn for V4SF floating initialization. -- Michael Meissner, IBM IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000) (revision 240142) +++ gcc/config/rs6000/rs6000.c (.../gcc/config/rs6000) (working copy) @@ -6821,11 +6821,26 @@ rs6000_expand_vector_init (rtx target, r rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2)); rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3)); - emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1)); - emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3)); - emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even)); - emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd)); - rs6000_expand_extract_even (target, flt_even, flt_odd); + /* Use VMRGEW if we can instead of doing a permute. */ + if (TARGET_P8_VECTOR) + { + emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op2)); + emit_insn (gen_vsx_concat_v2sf (dbl_odd, op1, op3)); + emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even)); + emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd)); + if (BYTES_BIG_ENDIAN) + emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_even, flt_odd)); + else + emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_odd, flt_even)); + } + else + { + emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1)); + emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3)); + emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even)); + emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd)); + rs6000_expand_extract_even (target, flt_even, flt_odd); + } } return; } Index: gcc/config/rs6000/altivec.md =================================================================== --- gcc/config/rs6000/altivec.md (.../svn+ssh://meiss...@gcc.gnu.org/svn/gcc/trunk/gcc/config/rs6000) (revision 240142) +++ gcc/config/rs6000/altivec.md (.../gcc/config/rs6000) (working copy) @@ -141,6 +141,7 @@ (define_c_enum "unspec" UNSPEC_VMRGH_DIRECT UNSPEC_VMRGL_DIRECT UNSPEC_VSPLT_DIRECT + UNSPEC_VMRGEW_DIRECT UNSPEC_VSUMSWS_DIRECT UNSPEC_VADDCUQ UNSPEC_VADDEUQM @@ -1340,6 
+1341,15 @@ (define_insn "p8_vmrgow" } [(set_attr "type" "vecperm")]) +(define_insn "p8_vmrgew_v4sf_direct" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v") + (match_operand:V4SF 2 "register_operand" "v")] + UNSPEC_VMRGEW_DIRECT))] + "TARGET_P8_VECTOR" + "vmrgew %0,%1,%2" + [(set_attr "type" "vecperm")]) + (define_expand "vec_widen_umult_even_v16qi" [(use (match_operand:V8HI 0 "register_operand" "")) (use (match_operand:V16QI 1 "register_operand" ""))