On Core 2, an unaligned vector load/store using movdqu is a very slow operation. Experiments show it is about six times slower than movdqa (the aligned form), irrespective of whether the data being accessed happens to be aligned at run time or not. On Core i7 there is no performance difference between the two, and on AMD processors movdqu is only about 10% slower.
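As an illustration (not part of the patch, and the exact code generated depends on compiler version and flags), the kind of loop affected is one where the vectorizer cannot prove 16-byte alignment of the pointers, so the vectorized body may have to use unaligned vector loads/stores such as movdqu:

/* Hypothetical example: compiled with -O3 -msse2, the vectorizer turns
   this loop into 128-bit vector code.  Because the alignment of A and B
   is unknown at compile time, any vector access it cannot peel or
   version for alignment becomes an unaligned load/store (movdqu on
   x86).  With this patch, such a loop would simply not be vectorized
   on Core 2 when the accesses remain unaligned.  */
void
scale (float *a, const float *b, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] = b[i] * 2.0f;
}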
This patch disables vectorization of loops that would need to generate these slow unaligned vector loads/stores on Core 2.

Do not vectorize loops on Core 2 that need to use unaligned vector load/stores.

        * tree-vect-stmts.c (is_slow_vect_unaligned_load_store): New function.
        (vect_analyze_stmt): Check if the vectorizable load/store is slow.
        * target.def (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New target hook.
        * doc/tm.texi.in: Document new target hook
        TARGET_SLOW_UNALIGNED_VECTOR_MEMOP.
        * doc/tm.texi: Regenerate.
        * config/i386/i386.c (ix86_slow_unaligned_vector_memop): New function.
        (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New macro.

Index: doc/tm.texi
===================================================================
--- doc/tm.texi	(revision 182265)
+++ doc/tm.texi	(working copy)
@@ -10984,6 +10984,11 @@ The result is another tree containing a simplified
 call's result.  If @var{ignore} is true the value will be ignored.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_SLOW_UNALIGNED_VECTOR_MEMOP (void)
+Return true if unaligned vector memory load/store is a slow operation
+on this target.
+@end deftypefn
+
 @deftypefn {Target Hook} {const char *} TARGET_INVALID_WITHIN_DOLOOP (const_rtx @var{insn})
 Take an instruction in @var{insn} and return NULL if it is valid within a

Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in	(revision 182265)
+++ doc/tm.texi.in	(working copy)
@@ -10875,6 +10875,11 @@ The result is another tree containing a simplified
 call's result.  If @var{ignore} is true the value will be ignored.
 @end deftypefn
 
+@hook TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
+Return true if unaligned vector memory load/store is a slow operation
+on this target.
+@end deftypefn
+
 @hook TARGET_INVALID_WITHIN_DOLOOP
 Take an instruction in @var{insn} and return NULL if it is valid within a

Index: target.def
===================================================================
--- target.def	(revision 182265)
+++ target.def	(working copy)
@@ -1221,6 +1221,12 @@ DEFHOOK
 tree, (tree fndecl, int n_args, tree *argp, bool ignore),
 hook_tree_tree_int_treep_bool_null)
 
+/* Returns true if unaligned vector loads/stores are slow.  */
+DEFHOOK
+(slow_unaligned_vector_memop,
+ "",
+ bool, (void), NULL)
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 DEFHOOK

Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 182265)
+++ tree-vect-stmts.c	(working copy)
@@ -4905,7 +4905,54 @@ vectorizable_condition (gimple stmt, gimple_stmt_i
   return true;
 }
 
+/* Returns true if the vector load/store is unaligned and if
+   unaligned vector load/stores are slow.  */
+
+static bool
+is_slow_vect_unaligned_load_store (gimple stmt)
+{
+  stmt_vec_info stmt_info;
+  struct data_reference *dr = NULL;
+
+  /* Are unaligned load/stores slow for this target?  */
+  if (!targetm.slow_unaligned_vector_memop
+      || !targetm.slow_unaligned_vector_memop ())
+    return false;
+
+  /* Harmful only if it is in a hot region of code when profiles are
+     available.  */
+  if (profile_status == PROFILE_READ
+      && !maybe_hot_bb_p (gimple_bb (stmt)))
+    return false;
+
+  stmt_info = vinfo_for_stmt (stmt);
+  if (!stmt_info)
+    return false;
+
+  /* Check whether the access is aligned.  For a strided access, use
+     the data reference of the first statement in the group.  */
+  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
+    {
+      gimple first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      if (first_stmt
+	  && vinfo_for_stmt (first_stmt))
+	dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+    }
+  else
+    dr = STMT_VINFO_DATA_REF (stmt_info);
+
+  if (!dr)
+    return false;
+
+  if (!aligned_access_p (dr))
+    return true;
+
+  return false;
+}
+
 /* Make sure the statement is vectorizable.  */
 
 bool
@@ -5065,27 +5112,43 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vect
   if (!bb_vinfo
       && (STMT_VINFO_RELEVANT_P (stmt_info)
          || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
+    {
       ok = (vectorizable_type_promotion (stmt, NULL, NULL, NULL)
             || vectorizable_type_demotion (stmt, NULL, NULL, NULL)
             || vectorizable_conversion (stmt, NULL, NULL, NULL)
             || vectorizable_shift (stmt, NULL, NULL, NULL)
             || vectorizable_operation (stmt, NULL, NULL, NULL)
             || vectorizable_assignment (stmt, NULL, NULL, NULL)
-            || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
             || vectorizable_call (stmt, NULL, NULL)
-            || vectorizable_store (stmt, NULL, NULL, NULL)
-            || vectorizable_reduction (stmt, NULL, NULL, NULL)
+            || vectorizable_reduction (stmt, NULL, NULL, NULL)
             || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
+
+      if (!ok)
+	{
+	  ok = (vectorizable_load (stmt, NULL, NULL, NULL, NULL)
+		|| vectorizable_store (stmt, NULL, NULL, NULL));
+
+	  if (ok && is_slow_vect_unaligned_load_store (stmt))
+	    ok = false;
+	}
+    }
   else
     {
       if (bb_vinfo)
-	ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
-	      || vectorizable_type_demotion (stmt, NULL, NULL, node)
-	      || vectorizable_shift (stmt, NULL, NULL, node)
-	      || vectorizable_operation (stmt, NULL, NULL, node)
-	      || vectorizable_assignment (stmt, NULL, NULL, node)
-	      || vectorizable_load (stmt, NULL, NULL, node, NULL)
-	      || vectorizable_store (stmt, NULL, NULL, node));
+	{
+	  ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
+		|| vectorizable_type_demotion (stmt, NULL, NULL, node)
+		|| vectorizable_shift (stmt, NULL, NULL, node)
+		|| vectorizable_operation (stmt, NULL, NULL, node)
+		|| vectorizable_assignment (stmt, NULL, NULL, node));
+	  if (!ok)
+	    {
+	      ok = (vectorizable_load (stmt, NULL, NULL, node, NULL)
+		    || vectorizable_store (stmt, NULL, NULL, node));
+	      if (ok && is_slow_vect_unaligned_load_store (stmt))
+		ok = false;
+	    }
+	}
     }
 
   if (!ok)

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 182265)
+++ config/i386/i386.c	(working copy)
@@ -26464,6 +26464,24 @@ ix86_init_mmx_sse_builtins (void)
     }
 }
 
+/* Detect whether an unaligned vectorizable load/store should be
+   considered slow.  This is true for Core 2, where the movdqu insn
+   is slow, ~5x slower than movdqa.  */
+
+static bool
+ix86_slow_unaligned_vector_memop (void)
+{
+  /* This is known to be slow on Core 2.  */
+  if (ix86_tune == PROCESSOR_CORE2_64
+      || ix86_tune == PROCESSOR_CORE2_32)
+    return true;
+
+  return false;
+}
+
 /* Internal method for ix86_init_builtins.  */
 
 static void
@@ -36624,6 +36642,9 @@ ix86_loop_unroll_adjust (unsigned nunroll, struct
 #undef TARGET_BUILD_BUILTIN_VA_LIST
 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
 
+#undef TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
+#define TARGET_SLOW_UNALIGNED_VECTOR_MEMOP ix86_slow_unaligned_vector_memop
+
 #undef TARGET_ENUM_VA_LIST_P
 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

-- 

This patch is available for review at http://codereview.appspot.com/5488054