Move the V4SF elements to SImode, initialize the vector as V4SI, and move
the result back to V4SF.
A better instruction sequence can be generated on Power9:

lfs + xxpermdi + xvcvdpsp + vmrgew
=>
lwz + (sldi + or) + mtvsrdd

With a follow-up patch, this can be further optimized to:

lwz + rldimi + mtvsrdd
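
For reference (not part of the patch itself), a vector initializer of
roughly the following shape is the kind of code that reaches this path in
rs6000_expand_vector_init; the function name and the -mvsx option are
illustrative assumptions only:

    /* Illustrative only: build a V4SF vector from four scalar floats
       (compile with -mvsx).  */
    vector float
    init_v4sf (float a, float b, float c, float d)
    {
      return (vector float) { a, b, c, d };
    }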

The point is to use lwz to avoid converting single precision to double
precision on load, and to pack the four 32-bit values directly into one
128-bit register.
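
As a plain-C sketch of that packing (a hypothetical helper, not GCC
internals; big-endian element order assumed, little endian swaps the
elements within each pair just as the patch does):

    #include <stdint.h>
    #include <string.h>

    /* Minimal sketch: mimic lwz + sldi/or + mtvsrdd with scalar code.  */
    static void
    pack_v4sf_words (float a, float b, float c, float d, uint64_t out[2])
    {
      uint32_t w[4];
      memcpy (&w[0], &a, sizeof (w[0]));  /* lwz: raw 32-bit pattern, no SP->DP.  */
      memcpy (&w[1], &b, sizeof (w[1]));
      memcpy (&w[2], &c, sizeof (w[2]));
      memcpy (&w[3], &d, sizeof (w[3]));
      out[0] = ((uint64_t) w[0] << 32) | w[1];  /* sldi + or (or rldimi).  */
      out[1] = ((uint64_t) w[2] << 32) | w[3];
      /* mtvsrdd would then combine the two doublewords into one VSR.  */
    }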

gcc/ChangeLog:

2020-07-10  Xionghu Luo  <luo...@linux.ibm.com>

        * config/rs6000/rs6000.c (rs6000_expand_vector_init):
        Move V4SF elements to SImode, initialize the vector as V4SI,
        and move the result back to V4SF.
---
 gcc/config/rs6000/rs6000.c | 49 +++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 58f5d780603..d94e88c23a5 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6423,35 +6423,34 @@ rs6000_expand_vector_init (rtx target, rtx vals)
        }
       else
        {
-         rtx dbl_even = gen_reg_rtx (V2DFmode);
-         rtx dbl_odd  = gen_reg_rtx (V2DFmode);
-         rtx flt_even = gen_reg_rtx (V4SFmode);
-         rtx flt_odd  = gen_reg_rtx (V4SFmode);
-         rtx op0 = force_reg (SFmode, XVECEXP (vals, 0, 0));
-         rtx op1 = force_reg (SFmode, XVECEXP (vals, 0, 1));
-         rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2));
-         rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3));
-
-         /* Use VMRGEW if we can instead of doing a permute.  */
-         if (TARGET_P8_VECTOR)
+         rtx tmpSF[4];
+         rtx tmpSI[4];
+         rtx tmpDI[4];
+         rtx mrgDI[4];
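+         /* Move the 32-bit bit pattern of each SFmode element into an
+            SImode register and zero-extend it to DImode.  */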
+         for (i = 0; i < 4; i++)
            {
-             emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op2));
-             emit_insn (gen_vsx_concat_v2sf (dbl_odd, op1, op3));
-             emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
-             emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
-             if (BYTES_BIG_ENDIAN)
-             emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_even, flt_odd));
-             else
-             emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_odd, flt_even));
+             tmpSI[i] = gen_reg_rtx (SImode);
+             tmpDI[i] = gen_reg_rtx (DImode);
+             mrgDI[i] = gen_reg_rtx (DImode);
+             tmpSF[i] = force_reg (SFmode, XVECEXP (vals, 0, i));
+             emit_insn (gen_movsi_from_sf (tmpSI[i], tmpSF[i]));
+             emit_insn (gen_zero_extendsidi2 (tmpDI[i], tmpSI[i]));
            }
-         else
+
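+         /* For little endian, swap the elements within each pair so the
+            merges below put them in the expected order.  */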
+         if (!BYTES_BIG_ENDIAN)
            {
-             emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1));
-             emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3));
-             emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
-             emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
-             rs6000_expand_extract_even (target, flt_even, flt_odd);
+             std::swap (tmpDI[0], tmpDI[1]);
+             std::swap (tmpDI[2], tmpDI[3]);
            }
+
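+         /* Pack each pair: shift the first element into the high 32 bits
+            and OR in the second element as the low 32 bits.  */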
+         emit_insn (gen_ashldi3 (mrgDI[0], tmpDI[0], GEN_INT (32)));
+         emit_insn (gen_iordi3 (mrgDI[1], mrgDI[0], tmpDI[1]));
+         emit_insn (gen_ashldi3 (mrgDI[2], tmpDI[2], GEN_INT (32)));
+         emit_insn (gen_iordi3 (mrgDI[3], mrgDI[2], tmpDI[3]));
+
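+         /* Concatenate the two doublewords into a V2DI register and view
+            the result as V4SF.  */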
+         rtx tmpV2DI = gen_reg_rtx (V2DImode);
+         emit_insn (gen_vsx_concat_v2di (tmpV2DI, mrgDI[1], mrgDI[3]));
+         emit_move_insn (target, gen_lowpart (V4SFmode, tmpV2DI));
        }
       return;
     }
-- 
2.27.0.90.geebb51ba8c
