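The following fixes a miscompilation caused by the recent vector load optimization. The check for an unused upper half of a vector load was incomplete: in addition to the accessed part of the group covering exactly half of the vector, the group size must also equal the number of vector elements.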
Bootstrap / regtest running on x86_64-unknown-linux-gnu, applied to trunk. Richard. 2019-05-06 Richard Biener <rguent...@suse.de> PR tree-optimization/90358 * tree-vect-stmts.c (get_group_load_store_type): Properly detect unused upper half of load. (vectorizable_load): Likewise. * gcc.target/i386/pr90358.c: New testcase. Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 270902) +++ gcc/tree-vect-stmts.c (working copy) @@ -2273,6 +2274,7 @@ get_group_load_store_type (stmt_vec_info == dr_aligned || alignment_support_scheme == dr_unaligned_supported) && known_eq (nunits, (group_size - gap) * 2) + && known_eq (nunits, group_size) && mode_for_vector (elmode, (group_size - gap)).exists (&vmode) && VECTOR_MODE_P (vmode) && targetm.vector_mode_supported_p (vmode) @@ -8550,7 +8552,8 @@ vectorizable_load (stmt_vec_info stmt_in && DR_GROUP_GAP (first_stmt_info) != 0 && known_eq (nunits, (group_size - - DR_GROUP_GAP (first_stmt_info)) * 2)) + - DR_GROUP_GAP (first_stmt_info)) * 2) + && known_eq (nunits, group_size)) ltype = build_vector_type (TREE_TYPE (vectype), (group_size - DR_GROUP_GAP Index: gcc/testsuite/gcc.target/i386/pr90358.c =================================================================== --- gcc/testsuite/gcc.target/i386/pr90358.c (nonexistent) +++ gcc/testsuite/gcc.target/i386/pr90358.c (working copy) @@ -0,0 +1,35 @@ +/* PR target/90358 */ +/* { dg-do run { target { sse4_runtime } } } */ +/* { dg-options "-O3 -msse4" } */ + +struct s { unsigned int a, b, c; }; + +void __attribute__ ((noipa)) +foo (struct s *restrict s1, struct s *restrict s2, int n) +{ + for (int i = 0; i < n; ++i) + { + s1[i].b = s2[i].b; + s1[i].c = s2[i].c; + s2[i].c = 0; + } +} + +#define N 12 + +int +main () +{ + struct s s1[N], s2[N]; + for (unsigned int j = 0; j < N; ++j) + { + s2[j].a = j * 5; + s2[j].b = j * 5 + 2; + s2[j].c = j * 5 + 4; + } + foo (s1, s2, N); + for (unsigned int j = 0; j < N; ++j) + if 
(s1[j].b != j * 5 + 2) + __builtin_abort (); + return 0; +}