On Core 2, an unaligned vector load/store using movdqu is a very slow operation. Experiments show it is about six times slower than movdqa (the aligned form), irrespective of whether the data being accessed happens to be aligned at run time or not. On Core i7 there is no performance difference between the two, and on AMD processors movdqu is only about 10% slower.
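As an illustration (not part of the patch, and the exact code generated depends on compiler version and flags), the kind of loop affected is one where the vectorizer cannot prove 16-byte alignment of the pointers, so the vectorized body may have to use unaligned vector loads/stores such as movdqu:

/* Hypothetical example: compiled with -O3 -msse2, the vectorizer turns
   this loop into 128-bit vector code.  Because the alignment of A and B
   is unknown at compile time, any vector access it cannot peel or
   version for alignment becomes an unaligned load/store (movdqu on
   x86).  With this patch, such a loop would simply not be vectorized
   on Core 2 when the accesses remain unaligned.  */
void
scale (float *a, const float *b, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] = b[i] * 2.0f;
}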
This patch disables vectorization of loops that would need to generate these slow unaligned vector loads/stores on Core 2.

Do not vectorize loops on Core 2 that need to use unaligned vector load/stores.

        * tree-vect-stmts.c (is_slow_vect_unaligned_load_store): New function.
        (vect_analyze_stmt): Check if the vectorizable load/store is slow.
        * target.def (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New target hook.
        * doc/tm.texi.in: Document new target hook
        TARGET_SLOW_UNALIGNED_VECTOR_MEMOP.
        * doc/tm.texi: Regenerate.
        * config/i386/i386.c (ix86_slow_unaligned_vector_memop): New function.
        (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New macro.

Index: doc/tm.texi
===================================================================
--- doc/tm.texi	(revision 182265)
+++ doc/tm.texi	(working copy)
@@ -10984,6 +10984,11 @@ The result is another tree containing a simplified
 call's result.  If @var{ignore} is true the value will be ignored.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_SLOW_UNALIGNED_VECTOR_MEMOP (void)
+Return true if unaligned vector memory load/store is a slow operation
+on this target.
+@end deftypefn
+
 @deftypefn {Target Hook} {const char *} TARGET_INVALID_WITHIN_DOLOOP (const_rtx @var{insn})
 Take an instruction in @var{insn} and return NULL if it is valid within a

Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in	(revision 182265)
+++ doc/tm.texi.in	(working copy)
@@ -10875,6 +10875,11 @@ The result is another tree containing a simplified
 call's result.  If @var{ignore} is true the value will be ignored.
 @end deftypefn
 
+@hook TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
+Return true if unaligned vector memory load/store is a slow operation
+on this target.
+@end deftypefn
+
 @hook TARGET_INVALID_WITHIN_DOLOOP
 Take an instruction in @var{insn} and return NULL if it is valid within a

Index: target.def
===================================================================
--- target.def	(revision 182265)
+++ target.def	(working copy)
@@ -1221,6 +1221,12 @@ DEFHOOK
 tree, (tree fndecl, int n_args, tree *argp, bool ignore),
 hook_tree_tree_int_treep_bool_null)
 
+/* Returns true if unaligned vector loads/stores are slow.  */
+DEFHOOK
+(slow_unaligned_vector_memop,
+ "",
+ bool, (void), NULL)
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 DEFHOOK

Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 182265)
+++ tree-vect-stmts.c	(working copy)
@@ -4905,7 +4905,54 @@ vectorizable_condition (gimple stmt, gimple_stmt_i
   return true;
 }
 
+/* Returns true if the vector load/store is unaligned and if
+   unaligned vector load/stores are slow.  */
+
+static bool
+is_slow_vect_unaligned_load_store (gimple stmt)
+{
+  stmt_vec_info stmt_info;
+  struct data_reference *dr = NULL;
+
+  /* Are unaligned load/stores slow for this target?  */
+  if (!targetm.slow_unaligned_vector_memop
+      || !targetm.slow_unaligned_vector_memop ())
+    return false;
+
+  /* Harmful only if it is in a hot region of code when profiles are
+     available.  */
+  if (profile_status == PROFILE_READ
+      && !maybe_hot_bb_p (gimple_bb (stmt)))
+    return false;
+
+  stmt_info = vinfo_for_stmt (stmt);
+  if (!stmt_info)
+    return false;
+
+  /* Check whether the access is aligned.  For a strided access, use
+     the data reference of the first statement in the group.  */
+  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
+    {
+      gimple first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      if (first_stmt
+	  && vinfo_for_stmt (first_stmt))
+	dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+    }
+  else
+    dr = STMT_VINFO_DATA_REF (stmt_info);
+
+  if (!dr)
+    return false;
+
+  if (!aligned_access_p (dr))
+    return true;
+
+  return false;
+}
+
 /* Make sure the statement is vectorizable.  */
 
 bool
@@ -5065,27 +5112,43 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vect
   if (!bb_vinfo
       && (STMT_VINFO_RELEVANT_P (stmt_info)
          || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
+    {
       ok = (vectorizable_type_promotion (stmt, NULL, NULL, NULL)
             || vectorizable_type_demotion (stmt, NULL, NULL, NULL)
             || vectorizable_conversion (stmt, NULL, NULL, NULL)
             || vectorizable_shift (stmt, NULL, NULL, NULL)
             || vectorizable_operation (stmt, NULL, NULL, NULL)
             || vectorizable_assignment (stmt, NULL, NULL, NULL)
-            || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
             || vectorizable_call (stmt, NULL, NULL)
-            || vectorizable_store (stmt, NULL, NULL, NULL)
-            || vectorizable_reduction (stmt, NULL, NULL, NULL)
+            || vectorizable_reduction (stmt, NULL, NULL, NULL)
             || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
+
+      if (!ok)
+	{
+	  ok = (vectorizable_load (stmt, NULL, NULL, NULL, NULL)
+		|| vectorizable_store (stmt, NULL, NULL, NULL));
+
+	  if (ok && is_slow_vect_unaligned_load_store (stmt))
+	    ok = false;
+	}
+    }
   else
     {
       if (bb_vinfo)
-	ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
-	      || vectorizable_type_demotion (stmt, NULL, NULL, node)
-	      || vectorizable_shift (stmt, NULL, NULL, node)
-	      || vectorizable_operation (stmt, NULL, NULL, node)
-	      || vectorizable_assignment (stmt, NULL, NULL, node)
-	      || vectorizable_load (stmt, NULL, NULL, node, NULL)
-	      || vectorizable_store (stmt, NULL, NULL, node));
+	{
+	  ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
+		|| vectorizable_type_demotion (stmt, NULL, NULL, node)
+		|| vectorizable_shift (stmt, NULL, NULL, node)
+		|| vectorizable_operation (stmt, NULL, NULL, node)
+		|| vectorizable_assignment (stmt, NULL, NULL, node));
+	  if (!ok)
+	    {
+	      ok = (vectorizable_load (stmt, NULL, NULL, node, NULL)
+		    || vectorizable_store (stmt, NULL, NULL, node));
+	      if (ok && is_slow_vect_unaligned_load_store (stmt))
+		ok = false;
+	    }
+	}
     }
 
   if (!ok)

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 182265)
+++ config/i386/i386.c	(working copy)
@@ -26464,6 +26464,24 @@ ix86_init_mmx_sse_builtins (void)
     }
 }
 
+/* Detect whether an unaligned vectorizable load/store should be
+   considered slow.  This is true for Core 2, where the movdqu insn
+   is slow, ~5x slower than movdqa.  */
+
+static bool
+ix86_slow_unaligned_vector_memop (void)
+{
+  /* This is known to be slow on Core 2.  */
+  if (ix86_tune == PROCESSOR_CORE2_64
+      || ix86_tune == PROCESSOR_CORE2_32)
+    return true;
+
+  return false;
+}
+
 /* Internal method for ix86_init_builtins.  */
 
 static void
@@ -36624,6 +36642,9 @@ ix86_loop_unroll_adjust (unsigned nunroll, struct
 #undef TARGET_BUILD_BUILTIN_VA_LIST
 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
 
+#undef TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
+#define TARGET_SLOW_UNALIGNED_VECTOR_MEMOP ix86_slow_unaligned_vector_memop
+
 #undef TARGET_ENUM_VA_LIST_P
 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

-- 

This patch is available for review at http://codereview.appspot.com/5488054