On Thu, Feb 14, 2019 at 1:30 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > Emulate MMX sse_cvtpi2ps with SSE2 cvtdq2ps, preserving upper 64 bits of > destination XMM register. Only SSE register source operand is allowed. > > PR target/89021 > * config/i386/mmx.md (sse_cvtpi2ps): Renamed to ... > (*mmx_cvtpi2ps): This. Disabled for TARGET_MMX_WITH_SSE. > (sse_cvtpi2ps): New. > (mmx_cvtpi2ps_sse): Likewise.
Now you can merge both instruction patterns into a single one by using: (clobber (match_scratch:V4SF 3 "=X,x,Yv")) Please note the "X" constraint for the original case, where the scratch register is not needed. Uros. > --- > gcc/config/i386/sse.md | 77 ++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 75 insertions(+), 2 deletions(-) > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 083f9ef0f44..b1bab15af41 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -4561,14 +4561,87 @@ > ;; > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; > > -(define_insn "sse_cvtpi2ps" > +(define_expand "sse_cvtpi2ps" > + [(set (match_operand:V4SF 0 "register_operand") > + (vec_merge:V4SF > + (vec_duplicate:V4SF > + (float:V2SF (match_operand:V2SI 2 "nonimmediate_operand"))) > + (match_operand:V4SF 1 "register_operand") > + (const_int 3)))] > + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSE" > +{ > + if (TARGET_MMX_WITH_SSE) > + { > + rtx op2 = force_reg (V2SImode, operands[2]); > + emit_insn (gen_mmx_cvtpi2ps_sse (operands[0], operands[1], op2)); > + DONE; > + } > +}) > + > +(define_insn_and_split "mmx_cvtpi2ps_sse" > + [(set (match_operand:V4SF 0 "register_operand" "=x,Yv") > + (vec_merge:V4SF > + (vec_duplicate:V4SF > + (float:V2SF (match_operand:V2SI 2 "register_operand" "x,Yv"))) > + (match_operand:V4SF 1 "register_operand" "0,Yv") > + (const_int 3))) > + (clobber (match_scratch:V4SF 3 "=x,Yv"))] > + "TARGET_MMX_WITH_SSE" > + "#" > + "&& reload_completed" > + [(const_int 0)] > +{ > + rtx op2 = lowpart_subreg (V4SImode, operands[2], > + GET_MODE (operands[2])); > + /* Generate SSE2 cvtdq2ps. */ > + rtx insn = gen_floatv4siv4sf2 (operands[3], op2); > + emit_insn (insn); > + > + /* Merge operands[3] with operands[0]. 
*/ > + rtx mask, op1; > + if (TARGET_AVX) > + { > + mask = gen_rtx_PARALLEL (VOIDmode, > + gen_rtvec (4, GEN_INT (0), GEN_INT (1), > + GEN_INT (6), GEN_INT (7))); > + op1 = gen_rtx_VEC_CONCAT (V8SFmode, operands[3], operands[1]); > + op2 = gen_rtx_VEC_SELECT (V4SFmode, op1, mask); > + insn = gen_rtx_SET (operands[0], op2); > + } > + else > + { > + /* NB: SSE can only concatenate OP0 and OP3 to OP0. */ > + mask = gen_rtx_PARALLEL (VOIDmode, > + gen_rtvec (4, GEN_INT (2), GEN_INT (3), > + GEN_INT (4), GEN_INT (5))); > + op1 = gen_rtx_VEC_CONCAT (V8SFmode, operands[0], operands[3]); > + op2 = gen_rtx_VEC_SELECT (V4SFmode, op1, mask); > + insn = gen_rtx_SET (operands[0], op2); > + emit_insn (insn); > + > + /* Swap bits 0:63 with bits 64:127. */ > + mask = gen_rtx_PARALLEL (VOIDmode, > + gen_rtvec (4, GEN_INT (2), GEN_INT (3), > + GEN_INT (0), GEN_INT (1))); > + rtx dest = gen_rtx_REG (V4SImode, REGNO (operands[0])); > + op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); > + insn = gen_rtx_SET (dest, op1); > + } > + emit_insn (insn); > + DONE; > +} > + [(set_attr "isa" "noavx,avx") > + (set_attr "type" "ssecvt") > + (set_attr "mode" "V4SF")]) > + > +(define_insn "*mmx_cvtpi2ps" > [(set (match_operand:V4SF 0 "register_operand" "=x") > (vec_merge:V4SF > (vec_duplicate:V4SF > (float:V2SF (match_operand:V2SI 2 "nonimmediate_operand" "ym"))) > (match_operand:V4SF 1 "register_operand" "0") > (const_int 3)))] > - "TARGET_SSE" > + "TARGET_SSE && !TARGET_MMX_WITH_SSE" > "cvtpi2ps\t{%2, %0|%0, %2}" > [(set_attr "type" "ssecvt") > (set_attr "mode" "V4SF")]) > -- > 2.20.1 >