On Tue, Nov 15, 2011 at 8:23 PM, Uros Bizjak <ubiz...@gmail.com> wrote:

> Attached patch optimizes v2df (x2) -> v4sf,v4si conversion sequences
> for AVX from:

>
>        vroundpd        $1, 32(%rsp), %xmm1
>        vroundpd        $1, 48(%rsp), %xmm0
>        vcvttpd2dqx     %xmm1, %xmm1
>        vcvttpd2dqx     %xmm0, %xmm0
>        vpunpcklqdq     %xmm0, %xmm1, %xmm0
>        vmovdqa %xmm0, 16(%rsp)
>
> to
>
>        vroundpd        $1, 64(%rsp), %xmm1
>        vroundpd        $1, 80(%rsp), %xmm0
>        vinsertf128     $0x1, %xmm0, %ymm1, %ymm0
>        vcvttpd2dqy     %ymm0, %xmm0
>        vmovdqa %xmm0, 32(%rsp)
>
> Ideally, this would be just "vcvtpd2psy 64(%rsp), %xmm0" or "vroundpd
> $1, 64(%rsp), %ymm1", but the vectorizer does not (yet) support mixed
> vectorization factors.

The attached patch optimizes the above code a step further, generating:

        vmovapd 64(%rsp), %xmm0
        vinsertf128     $0x1, 80(%rsp), %ymm0, %ymm0
        vroundpd        $1, %ymm0, %ymm0
        vcvttpd2dqy     %ymm0, %xmm0
        vmovdqa %xmm0, 32(%rsp)
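
For context, a scalar loop of roughly the following shape is the kind of
code that exercises these expanders (an illustrative example, not taken
from the patch or testsuite; compiled with something like -O3 -mavx
-ffast-math):

/* Each iteration rounds a double down and converts the result to an int,
   so the vectorizer packs two V2DF inputs into one V4SI result.  */
void
pack_floor (const double *x, int *y, int n)
{
  int i;

  for (i = 0; i < n; i++)
    y[i] = (int) __builtin_floor (x[i]);
}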

2011-11-16  Uros Bizjak  <ubiz...@gmail.com>

        * config/i386/sse.md (round<mode>2_vec_pack_sfix): Optimize V2DFmode
        sequence for AVX.
        (<sse4_1>_round<ssemodesuffix>_vec_pack_sfix<avxsizesuffix>): Ditto.

Tested on x86_64-pc-linux-gnu {,-m32} with an AVX target, committed to mainline SVN.

Uros.
Index: sse.md
===================================================================
--- sse.md      (revision 181402)
+++ sse.md      (working copy)
@@ -9962,17 +9962,32 @@
 {
   rtx tmp0, tmp1;
 
-  tmp0 = gen_reg_rtx (<MODE>mode);
-  tmp1 = gen_reg_rtx (<MODE>mode);
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
 
-  emit_insn
-    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
-                                                      operands[3]));
-  emit_insn
-    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
-                                                      operands[3]));
-  emit_insn
-    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_roundpd256 (tmp2, tmp0, operands[3]));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn
+       (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
+                                                         operands[3]));
+      emit_insn
+       (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
+                                                         operands[3]));
+      emit_insn
+       (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
   DONE;
 })
 
@@ -10053,14 +10068,29 @@
 {
   rtx tmp0, tmp1;
 
-  tmp0 = gen_reg_rtx (<MODE>mode);
-  tmp1 = gen_reg_rtx (<MODE>mode);
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
 
-  emit_insn (gen_round<mode>2 (tmp0, operands[1]));
-  emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
 
-  emit_insn
-    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_roundv4df2 (tmp2, tmp0));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn (gen_round<mode>2 (tmp0, operands[1]));
+      emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+
+      emit_insn
+       (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
   DONE;
 })
 

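For readers who think in intrinsics rather than in i386 expanders, the
effect of the new AVX path is roughly the following (an illustrative
sketch only, not code from the patch; the function name is mine):

#include <immintrin.h>

/* Concatenate two V2DF halves into one 256-bit vector, round it once
   (immediate 1 = round toward -inf, matching "vroundpd $1" above), then
   truncate-convert all four doubles to four ints in one instruction.  */
static inline __m128i
floor_pd_pack_epi32 (__m128d lo, __m128d hi)
{
  __m256d t = _mm256_insertf128_pd (_mm256_castpd128_pd256 (lo), hi, 1);

  t = _mm256_round_pd (t, _MM_FROUND_TO_NEG_INF);   /* vroundpd $1  */
  return _mm256_cvttpd_epi32 (t);                   /* vcvttpd2dqy  */
}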