Attached patch considerably improves zero-extended SImode -> DImode
moves between SSE registers for SSE4.1 targets. The patch teaches the
compiler to generate:

        vmovdqa m(%rip), %ymm1
        vpmovzxdq       %xmm1, %xmm1
        vpsrlw  %xmm1, %xmm0, %xmm0

to zero-extend the value in the SSE register, instead of round-tripping
the value through a GPR:

        vmovdqa m(%rip), %ymm1
        vmovd   %xmm1, %eax
        vmovq   %rax, %xmm1
        vpsrlw  %xmm1, %xmm0, %xmm0

... or horrible code for targets without a preference for inter-unit moves.

As mentioned by Jakub, there are other optimization opportunities with
count argument handling.

2017-04-06  Uros Bizjak  <ubiz...@gmail.com>

    PR target/80286
    * config/i386/sse.md (*vec_extractv4si_0_zext_sse4): New pattern.
    * config/i386/i386.md (*zero_extendsidi2):
    Add (?*x,*x) and (?*v,*v) alternatives.

Patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 246738)
+++ config/i386/i386.md (working copy)
@@ -3767,10 +3767,10 @@
 
 (define_insn "*zero_extendsidi2"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-                       "=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,*r")
+               "=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,?*x,?*v,*r")
        (zero_extend:DI
         (match_operand:SI 1 "x86_64_zext_operand"
-                       "0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  ,*k")))]
+               "0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  , *x, *v,*k")))]
   ""
 {
   switch (get_attr_type (insn))
@@ -3791,6 +3791,15 @@
       return "%vpextrd\t{$0, %1, %k0|%k0, %1, 0}";
 
     case TYPE_SSEMOV:
+      if (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1]))
+       {
+         if (EXT_REX_SSE_REG_P (operands[0])
+             || EXT_REX_SSE_REG_P (operands[1]))
+           return "vpmovzxdq\t{%t1, %g0|%g0, %t1}";
+         else
+           return "%vpmovzxdq\t{%1, %0|%0, %1}";
+       }
+
       if (GENERAL_REG_P (operands[0]))
        return "%vmovd\t{%1, %k0|%k0, %1}";
 
@@ -3813,6 +3822,10 @@
            (eq_attr "alternative" "10")
              (const_string "sse2")
            (eq_attr "alternative" "11")
+             (const_string "sse4")
+           (eq_attr "alternative" "12")
+             (const_string "avx512f")
+           (eq_attr "alternative" "13")
              (const_string "x64_avx512bw")
           ]
           (const_string "*")))
@@ -3821,16 +3834,16 @@
              (const_string "multi")
            (eq_attr "alternative" "5,6")
              (const_string "mmxmov")
-           (eq_attr "alternative" "7,9,10")
+           (eq_attr "alternative" "7,9,10,11,12")
              (const_string "ssemov")
            (eq_attr "alternative" "8")
              (const_string "sselog1")
-           (eq_attr "alternative" "11")
+           (eq_attr "alternative" "13")
              (const_string "mskmov")
           ]
           (const_string "imovx")))
    (set (attr "prefix_extra")
-     (if_then_else (eq_attr "alternative" "8")
+     (if_then_else (eq_attr "alternative" "8,11,12")
        (const_string "1")
        (const_string "*")))
    (set (attr "length_immediate")
@@ -3848,7 +3861,7 @@
    (set (attr "mode")
      (cond [(eq_attr "alternative" "5,6")
              (const_string "DI")
-           (eq_attr "alternative" "7,8,9")
+           (eq_attr "alternative" "7,8,9,11,12")
              (const_string "TI")
           ]
           (const_string "SI")))])
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md  (revision 246738)
+++ config/i386/sse.md  (working copy)
@@ -13516,18 +13516,6 @@
   "#"
   [(set_attr "isa" "*,sse4,*,*")])
 
-(define_insn_and_split "*vec_extractv4si_0_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
-       (zero_extend:DI
-         (vec_select:SI
-           (match_operand:V4SI 1 "register_operand" "v")
-           (parallel [(const_int 0)]))))]
-  "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
-  "operands[1] = gen_lowpart (SImode, operands[1]);")
-
 (define_insn "*vec_extractv2di_0_sse"
   [(set (match_operand:DI 0 "nonimmediate_operand"     "=v,m")
        (vec_select:DI
@@ -13546,6 +13534,35 @@
   [(set (match_dup 0) (match_dup 1))]
   "operands[1] = gen_lowpart (<MODE>mode, operands[1]);")
 
+(define_insn "*vec_extractv4si_0_zext_sse4"
+  [(set (match_operand:DI 0 "register_operand" "=r,x,v")
+       (zero_extend:DI
+         (vec_select:SI
+           (match_operand:V4SI 1 "register_operand" "Yj,x,v")
+           (parallel [(const_int 0)]))))]
+  "TARGET_SSE4_1"
+  "#"
+  [(set_attr "isa" "x64,*,avx512f")])
+
+(define_insn "*vec_extractv4si_0_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (zero_extend:DI
+         (vec_select:SI
+           (match_operand:V4SI 1 "register_operand" "x")
+           (parallel [(const_int 0)]))))]
+  "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
+  "#")
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+       (zero_extend:DI
+         (vec_select:SI
+           (match_operand:V4SI 1 "register_operand")
+           (parallel [(const_int 0)]))))]
+  "TARGET_SSE2 && reload_completed"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
+  "operands[1] = gen_lowpart (SImode, operands[1]);")
+
 (define_insn "*vec_extractv4si"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,rm,Yr,*x,x,Yv")
        (vec_select:SI

Reply via email to