Move V4SF to V4SI, init the vector like V4SI, and then move it back to V4SF.
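
As an illustration of the kind of V4SF construction this path expands (a
minimal sketch only, not part of this patch; the function name and the
suggested -mcpu=power9 -O2 flags are just an example):

  #include <altivec.h>

  vector float
  init_v4sf (float *a, float *b, float *c, float *d)
  {
    /* Four independent SFmode values gathered into one V4SF vector;
       the non-constant case goes through rs6000_expand_vector_init.  */
    return (vector float) { *a, *b, *c, *d };
  }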
A better instruction sequence can be generated for such code on Power9:

  lfs + xxpermdi + xvcvdpsp + vmrgew  =>  lwz + (sldi + or) + mtvsrdd

With a follow-up patch, it can be further optimized to:

  lwz + rldimi + mtvsrdd

The point is to use lwz so the single-precision values are not converted
to double-precision on load, and to pack the four 32-bit values directly
into one 128-bit register.

gcc/ChangeLog:

2020-07-10  Xionghu Luo  <luo...@linux.ibm.com>

	* config/rs6000/rs6000.c (rs6000_expand_vector_init): Move V4SF
	to V4SI, init vector like V4SI and move to V4SF back.
---
 gcc/config/rs6000/rs6000.c | 49 +++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 58f5d780603..d94e88c23a5 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6423,35 +6423,34 @@ rs6000_expand_vector_init (rtx target, rtx vals)
 	}
       else
 	{
-	  rtx dbl_even = gen_reg_rtx (V2DFmode);
-	  rtx dbl_odd  = gen_reg_rtx (V2DFmode);
-	  rtx flt_even = gen_reg_rtx (V4SFmode);
-	  rtx flt_odd  = gen_reg_rtx (V4SFmode);
-	  rtx op0 = force_reg (SFmode, XVECEXP (vals, 0, 0));
-	  rtx op1 = force_reg (SFmode, XVECEXP (vals, 0, 1));
-	  rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2));
-	  rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3));
-
-	  /* Use VMRGEW if we can instead of doing a permute.  */
-	  if (TARGET_P8_VECTOR)
+	  rtx tmpSF[4];
+	  rtx tmpSI[4];
+	  rtx tmpDI[4];
+	  rtx mrgDI[4];
+	  for (i = 0; i < 4; i++)
 	    {
-	      emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op2));
-	      emit_insn (gen_vsx_concat_v2sf (dbl_odd, op1, op3));
-	      emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
-	      emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
-	      if (BYTES_BIG_ENDIAN)
-		emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_even, flt_odd));
-	      else
-		emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_odd, flt_even));
+	      tmpSI[i] = gen_reg_rtx (SImode);
+	      tmpDI[i] = gen_reg_rtx (DImode);
+	      mrgDI[i] = gen_reg_rtx (DImode);
+	      tmpSF[i] = force_reg (SFmode, XVECEXP (vals, 0, i));
+	      emit_insn (gen_movsi_from_sf (tmpSI[i], tmpSF[i]));
+	      emit_insn (gen_zero_extendsidi2 (tmpDI[i], tmpSI[i]));
 	    }
-	  else
+
+	  if (!BYTES_BIG_ENDIAN)
 	    {
-	      emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1));
-	      emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3));
-	      emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
-	      emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
-	      rs6000_expand_extract_even (target, flt_even, flt_odd);
+	      std::swap (tmpDI[0], tmpDI[1]);
+	      std::swap (tmpDI[2], tmpDI[3]);
 	    }
+
+	  emit_insn (gen_ashldi3 (mrgDI[0], tmpDI[0], GEN_INT (32)));
+	  emit_insn (gen_iordi3 (mrgDI[1], mrgDI[0], tmpDI[1]));
+	  emit_insn (gen_ashldi3 (mrgDI[2], tmpDI[2], GEN_INT (32)));
+	  emit_insn (gen_iordi3 (mrgDI[3], mrgDI[2], tmpDI[3]));
+
+	  rtx tmpV2DI = gen_reg_rtx (V2DImode);
+	  emit_insn (gen_vsx_concat_v2di (tmpV2DI, mrgDI[1], mrgDI[3]));
+	  emit_move_insn (target, gen_lowpart (V4SFmode, tmpV2DI));
 	}
       return;
     }
-- 
2.27.0.90.geebb51ba8c
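
For reference, a scalar sketch of the packing the new expander performs
(illustration only, not part of the patch; pack_two_sf is a hypothetical
helper): two 32-bit bit patterns are merged into one 64-bit value with a
shift and an or (sldi + or, later combinable into rldimi), and the two
64-bit halves are then concatenated into the 128-bit vector register
(mtvsrdd):

  #include <stdint.h>
  #include <string.h>

  static inline uint64_t
  pack_two_sf (float hi, float lo)
  {
    uint32_t hi_bits, lo_bits;
    /* Reinterpret the float bit patterns without any FP conversion,
       mirroring gen_movsi_from_sf in the expander.  */
    memcpy (&hi_bits, &hi, sizeof (hi_bits));
    memcpy (&lo_bits, &lo, sizeof (lo_bits));
    /* sldi + or; equivalent to a single rldimi.  */
    return ((uint64_t) hi_bits << 32) | lo_bits;
  }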