Hello!

The attached patch works around a deficiency in reload, where reload
gives up on handling subregs of pseudos (please see the PR [1] for
Ulrich's explanation). The patch simply avoids generating V4SF moves
through V4SF subregs of V2DF values unless really necessary (i.e. when
moving SSE2 modes without SSE2 enabled, which shouldn't happen anyway).
With a patched gcc, the expand pass emits (unaligned) moves in their
original mode, and this mode is kept until the asm is generated. The
asm instruction is chosen according to the "mode" attribute of the insn
pattern, which is calculated from the various influencing conditions
(tuning flags and optimizing for size).
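
To make this concrete, below is a minimal sketch of the kind of
misaligned V2DF access involved. This is my own illustration, not the
testcase from the PR; the under-aligned typedef merely forces an
unaligned vector move:

typedef double v2df __attribute__ ((vector_size (16)));
/* The same vector type, but only guaranteed 8-byte alignment.  */
typedef double v2df_u __attribute__ ((vector_size (16), aligned (8)));

v2df
load_v2df (const v2df_u *p)
{
  /* Expands to an unaligned V2DF move.  Previously, tunings with
     TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL rewrote this move through
     V4SF subregs at expand time; with the patch the V2DF mode is kept
     and the final movupd vs. movups choice is made from the insn's
     "mode" attribute.  */
  return *p;
}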

2012-05-09  Uros Bizjak  <ubiz...@gmail.com>

        PR target/44141
        * config/i386/i386.c (ix86_expand_vector_move_misalign): Do not handle
        128 bit vectors specially for TARGET_AVX.  Emit sse2_movupd and
        sse_movups RTXes for TARGET_AVX, TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
        or when optimizing for size.
        * config/i386/sse.md (*mov<mode>_internal): Remove
        TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling from asm output code.
        Calculate "mode" attribute according to optimize_function_for_size_p
        and TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL flag.
        (*<sse>_movu<ssemodesuffix><avxsizesuffix>): Choose asm template
        depending on the mode of the instruction.  Calculate "mode" attribute
        according to optimize_function_for_size_p, TARGET_SSE_TYPELESS_STORES
        and TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL flags.
        (*<sse2>_movdqu<avxsizesuffix>): Ditto.

The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32}.

The patch also fixes the testcase from the PR.

The patch will be committed to mainline SVN.

[1] http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44141#c16

Uros.
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md  (revision 187286)
+++ config/i386/sse.md  (working copy)
@@ -449,8 +449,6 @@
              && (misaligned_operand (operands[0], <MODE>mode)
                  || misaligned_operand (operands[1], <MODE>mode)))
            return "vmovupd\t{%1, %0|%0, %1}";
-         else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-           return "%vmovaps\t{%1, %0|%0, %1}";
          else
            return "%vmovapd\t{%1, %0|%0, %1}";
 
@@ -460,8 +458,6 @@
              && (misaligned_operand (operands[0], <MODE>mode)
                  || misaligned_operand (operands[1], <MODE>mode)))
            return "vmovdqu\t{%1, %0|%0, %1}";
-         else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-           return "%vmovaps\t{%1, %0|%0, %1}";
          else
            return "%vmovdqa\t{%1, %0|%0, %1}";
 
@@ -475,19 +471,21 @@
   [(set_attr "type" "sselog1,ssemov,ssemov")
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
-       (cond [(match_test "TARGET_AVX")
+       (cond [(and (eq_attr "alternative" "1,2")
+                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+                (if_then_else
+                   (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+                   (const_string "V8SF")
+                   (const_string "V4SF"))
+              (match_test "TARGET_AVX")
                 (const_string "<sseinsnmode>")
-              (ior (ior (match_test "optimize_function_for_size_p (cfun)")
-                        (not (match_test "TARGET_SSE2")))
+              (ior (and (eq_attr "alternative" "1,2")
+                        (match_test "optimize_function_for_size_p (cfun)"))
                    (and (eq_attr "alternative" "2")
                         (match_test "TARGET_SSE_TYPELESS_STORES")))
                 (const_string "V4SF")
-              (eq (const_string "<MODE>mode") (const_string "V4SFmode"))
-                (const_string "V4SF")
-              (eq (const_string "<MODE>mode") (const_string "V2DFmode"))
-                (const_string "V2DF")
              ]
-         (const_string "TI")))])
+         (const_string "<sseinsnmode>")))])
 
 (define_insn "sse2_movq128"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
@@ -597,11 +595,33 @@
          [(match_operand:VF 1 "nonimmediate_operand" "xm,x")]
          UNSPEC_MOVU))]
   "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    default:
+      return "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}";
+    }
+}
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
    (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "<MODE>")])
+   (set (attr "mode")
+       (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (if_then_else
+                   (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+                   (const_string "V8SF")
+                   (const_string "V4SF"))
+              (match_test "TARGET_AVX")
+                (const_string "<MODE>")
+              (ior (match_test "optimize_function_for_size_p (cfun)")
+                   (and (eq_attr "alternative" "1")
+                        (match_test "TARGET_SSE_TYPELESS_STORES")))
+                (const_string "V4SF")
+             ]
+       (const_string "<MODE>")))])
 
 (define_expand "<sse2>_movdqu<avxsizesuffix>"
   [(set (match_operand:VI1 0 "nonimmediate_operand")
@@ -618,7 +638,16 @@
        (unspec:VI1 [(match_operand:VI1 1 "nonimmediate_operand" "xm,x")]
                    UNSPEC_MOVU))]
   "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "%vmovdqu\t{%1, %0|%0, %1}"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    default:
+      return "%vmovdqu\t{%1, %0|%0, %1}";
+    }
+}
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
    (set (attr "prefix_data16")
@@ -627,7 +656,20 @@
      (const_string "*")
      (const_string "1")))
    (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "<sseinsnmode>")])
+   (set (attr "mode")
+       (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (if_then_else
+                   (match_test "GET_MODE_SIZE (<MODE>mode) > 16")
+                   (const_string "V8SF")
+                   (const_string "V4SF"))
+              (match_test "TARGET_AVX")
+                (const_string "<sseinsnmode>")
+              (ior (match_test "optimize_function_for_size_p (cfun)")
+                   (and (eq_attr "alternative" "1")
+                        (match_test "TARGET_SSE_TYPELESS_STORES")))
+                (const_string "V4SF")
+             ]
+       (const_string "<sseinsnmode>")))])
 
 (define_insn "<sse3>_lddqu<avxsizesuffix>"
   [(set (match_operand:VI1 0 "register_operand" "=x")
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 187289)
+++ config/i386/i386.c  (working copy)
@@ -15907,60 +15907,19 @@ ix86_expand_vector_move_misalign (enum machine_mod
   op0 = operands[0];
   op1 = operands[1];
 
-  if (TARGET_AVX)
+  if (TARGET_AVX
+      && GET_MODE_SIZE (mode) == 32)
     {
       switch (GET_MODE_CLASS (mode))
        {
        case MODE_VECTOR_INT:
        case MODE_INT:
-         switch (GET_MODE_SIZE (mode))
-           {
-           case 16:
-             if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-               {
-                 op0 = gen_lowpart (V4SFmode, op0);
-                 op1 = gen_lowpart (V4SFmode, op1);
-                 emit_insn (gen_sse_movups (op0, op1));
-               }
-             else
-               {
-                 op0 = gen_lowpart (V16QImode, op0);
-                 op1 = gen_lowpart (V16QImode, op1);
-                 emit_insn (gen_sse2_movdqu (op0, op1));
-               }
-             break;
-           case 32:
-             op0 = gen_lowpart (V32QImode, op0);
-             op1 = gen_lowpart (V32QImode, op1);
-             ix86_avx256_split_vector_move_misalign (op0, op1);
-             break;
-           default:
-             gcc_unreachable ();
-           }
-         break;
+         op0 = gen_lowpart (V32QImode, op0);
+         op1 = gen_lowpart (V32QImode, op1);
+         /* FALLTHRU */
+
        case MODE_VECTOR_FLOAT:
-         switch (mode)
-           {
-           case V4SFmode:
-             emit_insn (gen_sse_movups (op0, op1));
-             break;
-           case V2DFmode:
-             if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-               {
-                 op0 = gen_lowpart (V4SFmode, op0);
-                 op1 = gen_lowpart (V4SFmode, op1);
-                 emit_insn (gen_sse_movups (op0, op1));
-               }
-             else
-               emit_insn (gen_sse2_movupd (op0, op1));
-             break;
-           case V8SFmode:
-           case V4DFmode:
-             ix86_avx256_split_vector_move_misalign (op0, op1);
-             break;
-           default:
-             gcc_unreachable ();
-           }
+         ix86_avx256_split_vector_move_misalign (op0, op1);
          break;
 
        default:
@@ -15972,16 +15931,6 @@ ix86_expand_vector_move_misalign (enum machine_mod
 
   if (MEM_P (op1))
     {
-      /* If we're optimizing for size, movups is the smallest.  */
-      if (optimize_insn_for_size_p ()
-         || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-       {
-         op0 = gen_lowpart (V4SFmode, op0);
-         op1 = gen_lowpart (V4SFmode, op1);
-         emit_insn (gen_sse_movups (op0, op1));
-         return;
-       }
-
       /* ??? If we have typed data, then it would appear that using
         movdqu is the only way to get unaligned data loaded with
         integer type.  */
@@ -15989,16 +15938,19 @@ ix86_expand_vector_move_misalign (enum machine_mod
        {
          op0 = gen_lowpart (V16QImode, op0);
          op1 = gen_lowpart (V16QImode, op1);
+         /* We will eventually emit movups based on insn attributes.  */
          emit_insn (gen_sse2_movdqu (op0, op1));
-         return;
        }
-
-      if (TARGET_SSE2 && mode == V2DFmode)
+      else if (TARGET_SSE2 && mode == V2DFmode)
         {
           rtx zero;
 
-         if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
+         if (TARGET_AVX
+             || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+             || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+             || optimize_function_for_size_p (cfun))
            {
+             /* We will eventually emit movups based on insn attributes.  */
              emit_insn (gen_sse2_movupd (op0, op1));
              return;
            }
@@ -16030,7 +15982,10 @@ ix86_expand_vector_move_misalign (enum machine_mod
        }
       else
         {
-         if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
+         if (TARGET_AVX
+             || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+             || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+             || optimize_function_for_size_p (cfun))
            {
              op0 = gen_lowpart (V4SFmode, op0);
              op1 = gen_lowpart (V4SFmode, op1);
@@ -16045,6 +16000,7 @@ ix86_expand_vector_move_misalign (enum machine_mod
 
          if (mode != V4SFmode)
            op0 = gen_lowpart (V4SFmode, op0);
+
          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (op0, op0, m));
          m = adjust_address (op1, V2SFmode, 8);
@@ -16053,30 +16009,20 @@ ix86_expand_vector_move_misalign (enum machine_mod
     }
   else if (MEM_P (op0))
     {
-      /* If we're optimizing for size, movups is the smallest.  */
-      if (optimize_insn_for_size_p ()
-         || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
-       {
-         op0 = gen_lowpart (V4SFmode, op0);
-         op1 = gen_lowpart (V4SFmode, op1);
-         emit_insn (gen_sse_movups (op0, op1));
-         return;
-       }
-
-      /* ??? Similar to above, only less clear
-        because of typeless stores.  */
-      if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
-         && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
         {
          op0 = gen_lowpart (V16QImode, op0);
          op1 = gen_lowpart (V16QImode, op1);
+         /* We will eventually emit movups based on insn attributes.  */
          emit_insn (gen_sse2_movdqu (op0, op1));
-         return;
        }
-
-      if (TARGET_SSE2 && mode == V2DFmode)
+      else if (TARGET_SSE2 && mode == V2DFmode)
        {
-         if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+         if (TARGET_AVX
+             || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+             || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+             || optimize_function_for_size_p (cfun))
+           /* We will eventually emit movups based on insn attributes.  */
            emit_insn (gen_sse2_movupd (op0, op1));
          else
            {
@@ -16091,7 +16037,10 @@ ix86_expand_vector_move_misalign (enum machine_mod
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);
 
-         if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+         if (TARGET_AVX
+             || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+             || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+             || optimize_function_for_size_p (cfun))
            {
              op0 = gen_lowpart (V4SFmode, op0);
              emit_insn (gen_sse_movups (op0, op1));
