On Tue, Nov 15, 2011 at 8:23 PM, Uros Bizjak <ubiz...@gmail.com> wrote:
> Attached patch optimizes v2df (x2) -> v4sf,v4si conversion sequences
> for AVX from:
>
>         vroundpd        $1, 32(%rsp), %xmm1
>         vroundpd        $1, 48(%rsp), %xmm0
>         vcvttpd2dqx     %xmm1, %xmm1
>         vcvttpd2dqx     %xmm0, %xmm0
>         vpunpcklqdq     %xmm0, %xmm1, %xmm0
>         vmovdqa         %xmm0, 16(%rsp)
>
> to
>
>         vroundpd        $1, 64(%rsp), %xmm1
>         vroundpd        $1, 80(%rsp), %xmm0
>         vinsertf128     $0x1, %xmm0, %ymm1, %ymm0
>         vcvttpd2dqy     %ymm0, %xmm0
>         vmovdqa         %xmm0, 32(%rsp)
>
> Ideally, this would be just "vcvtpd2psy 64(%rsp), %xmm0" or "vroundpd
> $1, 64(%rsp), %ymm1", but the vectorizer does not (yet) support mixed
> vectorization factors.

The attached patch optimizes the above code a step further, generating:

        vmovapd         64(%rsp), %xmm0
        vinsertf128     $0x1, 80(%rsp), %ymm0, %ymm0
        vroundpd        $1, %ymm0, %ymm0
        vcvttpd2dqy     %ymm0, %xmm0
        vmovdqa         %xmm0, 32(%rsp)

2011-11-16  Uros Bizjak  <ubiz...@gmail.com>

	* config/i386/sse.md (round<mode>2_vec_pack_sfix): Optimize
	V2DFmode sequence for AVX.
	(<sse4_1>_round<ssemodesuffix>_vec_pack_sfix<avxsizesuffix>): Ditto.

Tested on x86_64-pc-linux-gnu {,-m32} AVX target, committed to mainline SVN.

Uros.
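[For readers who want to reproduce the sequences above: the exact test case
is not part of this mail, but a loop of roughly the following shape should
exercise the round-and-pack expanders once it is vectorized, compiled with
something like -O3 -mavx -ffast-math. Illustrative sketch only; names are
made up.]

        #include <math.h>

        #define N 1024

        static double in[N];
        static int out[N];

        void
        convert (void)
        {
          int i;

          /* With -ffast-math, (int) floor (x) becomes a floor
             (vroundpd $1) followed by a truncating double -> int
             conversion, which the vectorizer packs into the
             V2DF -> V4SI sequences shown above.  */
          for (i = 0; i < N; i++)
            out[i] = (int) floor (in[i]);
        }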
Index: sse.md
===================================================================
--- sse.md	(revision 181402)
+++ sse.md	(working copy)
@@ -9962,17 +9962,32 @@
 {
   rtx tmp0, tmp1;
 
-  tmp0 = gen_reg_rtx (<MODE>mode);
-  tmp1 = gen_reg_rtx (<MODE>mode);
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
 
-  emit_insn
-    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
-							operands[3]));
-  emit_insn
-    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
-							operands[3]));
-  emit_insn
-    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_roundpd256 (tmp2, tmp0, operands[3]));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn
+	(gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
+							   operands[3]));
+      emit_insn
+	(gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
+							   operands[3]));
+      emit_insn
+	(gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
   DONE;
 })
 
@@ -10053,14 +10068,29 @@
 {
   rtx tmp0, tmp1;
 
-  tmp0 = gen_reg_rtx (<MODE>mode);
-  tmp1 = gen_reg_rtx (<MODE>mode);
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
 
-  emit_insn (gen_round<mode>2 (tmp0, operands[1]));
-  emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
 
-  emit_insn
-    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_roundv4df2 (tmp2, tmp0));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn (gen_round<mode>2 (tmp0, operands[1]));
+      emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+
+      emit_insn
+	(gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
   DONE;
 })
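[As an aside, the new V2DFmode path in both expanders corresponds to roughly
the following AVX intrinsics sequence: concatenate the two V2DF operands into
one V4DF value, round it once, and truncate all four lanes to SImode with a
single vcvttpd2dqy. Illustrative sketch only, not part of the patch; the
function name is made up, and floor rounding (imm8 = 1) is assumed.]

        #include <immintrin.h>

        __m128i
        round_pack_sfix (__m128d a, __m128d b)
        {
          /* vinsertf128 $0x1: build the 256-bit vector {a, b}
             (the intrinsics equivalent of gen_avx_vec_concatv4df).  */
          __m256d t = _mm256_insertf128_pd (_mm256_castpd128_pd256 (a), b, 1);

          /* vroundpd $1: round all four doubles toward -inf
             (_MM_FROUND_FLOOR is immediate 1).  */
          t = _mm256_round_pd (t, _MM_FROUND_FLOOR);

          /* vcvttpd2dqy: truncate four doubles to four 32-bit integers
             (the intrinsics equivalent of gen_fix_truncv4dfv4si2).  */
          return _mm256_cvttpd_epi32 (t);
        }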