Hi Richard, as per your suggestion, I tried to fix the PR by splitting the SLP store group at the vector boundary after the SLP tree is built.
Bootstrap passed on x86_64. I checked the patch with check_GNU_style.sh. The gfortran.dg/pr46519-1.f test now gets SLP vectorized, and hence it generated 2 more vzeroupper instructions. As recommended, I adjusted the test case by adding -fno-tree-slp-vectorize so that it produces the expected output after loop vectorization. The following tests are now passing. ------ Snip----- Tests that now work, but didn't before: gcc.dg/vect/bb-slp-19.c -flto -ffat-lto-objects scan-tree-dump-times slp2 "basic block vectorized" 1 gcc.dg/vect/bb-slp-19.c scan-tree-dump-times slp2 "basic block vectorized" 1 New tests that PASS: gcc.dg/vect/pr58135.c (test for excess errors) gcc.dg/vect/pr58135.c -flto -ffat-lto-objects (test for excess errors) ------ Snip----- ChangeLog 2016-05-14 Venkataramanan Kumar <venkataramanan.ku...@amd.com> PR tree-optimization/58135 * tree-vect-slp.c: When group size is not a multiple of vector size, allow splitting of store group at vector boundary. Test suite ChangeLog 2016-05-14 Venkataramanan Kumar <venkataramanan.ku...@amd.com> * gcc.dg/vect/bb-slp-19.c: Remove XFAIL. * gcc.dg/vect/pr58135.c: Add new. * gfortran.dg/pr46519-1.f: Adjust test case. Is the attached patch OK for trunk? Regards, Venkat.
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-19.c b/gcc/testsuite/gcc.dg/vect/bb-slp-19.c index 42cd294..c282155 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-19.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-19.c @@ -53,5 +53,5 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr58135.c b/gcc/testsuite/gcc.dg/vect/pr58135.c new file mode 100644 index 0000000..ca25000 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr58135.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ + +int a[100]; +void foo () +{ + a[0] = a[1] = a[2] = a[3] = a[4]= 0; +} + +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ diff --git a/gcc/testsuite/gfortran.dg/pr46519-1.f b/gcc/testsuite/gfortran.dg/pr46519-1.f index 51c64b8..46be9f5 100644 --- a/gcc/testsuite/gfortran.dg/pr46519-1.f +++ b/gcc/testsuite/gfortran.dg/pr46519-1.f @@ -1,5 +1,5 @@ ! { dg-do compile { target i?86-*-* x86_64-*-* } } -! { dg-options "-O3 -mavx -mvzeroupper -mtune=generic -dp" } +! { dg-options "-O3 -mavx -mvzeroupper -fno-tree-slp-vectorize -mtune=generic -dp" } PROGRAM MG3XDEMO INTEGER LM, NM, NV, NR, NIT diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index d713848..23a127f 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -1754,18 +1754,6 @@ vect_analyze_slp_instance (vec_info *vinfo, } nunits = TYPE_VECTOR_SUBPARTS (vectype); - /* Calculate the unrolling factor. 
*/ - unrolling_factor = least_common_multiple (nunits, group_size) / group_size; - if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: unrolling required in basic" - " block SLP\n"); - - return false; - } - /* Create a node (a root of the SLP tree) for the packed grouped stores. */ scalar_stmts.create (group_size); next = stmt; @@ -1801,126 +1789,151 @@ vect_analyze_slp_instance (vec_info *vinfo, /* Build the tree for the SLP instance. */ bool *matches = XALLOCAVEC (bool, group_size); unsigned npermutes = 0; - if ((node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, - &max_nunits, &loads, matches, &npermutes, - NULL, max_tree_size)) != NULL) - { - /* Calculate the unrolling factor based on the smallest type. */ - if (max_nunits > nunits) - unrolling_factor = least_common_multiple (max_nunits, group_size) - / group_size; - if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: unrolling required in basic" - " block SLP\n"); - vect_free_slp_tree (node); - loads.release (); - return false; - } + node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, + &max_nunits, &loads, matches, &npermutes, + NULL, max_tree_size); - /* Create a new SLP instance. */ - new_instance = XNEW (struct _slp_instance); - SLP_INSTANCE_TREE (new_instance) = node; - SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; - SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; - SLP_INSTANCE_LOADS (new_instance) = loads; - - /* Compute the load permutation. */ - slp_tree load_node; - bool loads_permuted = false; - FOR_EACH_VEC_ELT (loads, i, load_node) + if (node != NULL) + { + /* Calculate the unrolling factor. 
*/ + unrolling_factor = least_common_multiple + (nunits, group_size) / group_size; + + if (is_a <bb_vec_info> (vinfo) + && nunits < group_size + && unrolling_factor != 1 + && is_a <bb_vec_info> (vinfo)) { - vec<unsigned> load_permutation; - int j; - gimple *load, *first_stmt; - bool this_load_permuted = false; - load_permutation.create (group_size); - first_stmt = GROUP_FIRST_ELEMENT - (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load) - { - int load_place - = vect_get_place_in_interleaving_chain (load, first_stmt); - gcc_assert (load_place != -1); - if (load_place != j) - this_load_permuted = true; - load_permutation.safe_push (load_place); - } - if (!this_load_permuted - /* The load requires permutation when unrolling exposes - a gap either because the group is larger than the SLP - group-size or because there is a gap between the groups. */ - && (unrolling_factor == 1 - || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt)) - && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Build SLP failed: store group " + "size not a multiple of the vector size " + "in basic block SLP\n"); + /* Fatal mismatch. */ + matches[nunits] = false; + } + else + { + /* Calculate the unrolling factor based on the smallest type. 
*/ + if (max_nunits > nunits) + unrolling_factor + = least_common_multiple (max_nunits, group_size)/group_size; + + if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo)) { - load_permutation.release (); - continue; + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "Build SLP failed: unrolling " + "required in basic block SLP\n"); + vect_free_slp_tree (node); + loads.release (); + return false; } - SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation; - loads_permuted = true; - } - if (loads_permuted) - { - if (!vect_supported_load_permutation_p (new_instance)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: unsupported load " - "permutation "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - vect_free_slp_instance (new_instance); - return false; - } - } + /* Create a new SLP instance. */ + new_instance = XNEW (struct _slp_instance); + SLP_INSTANCE_TREE (new_instance) = node; + SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; + SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; + SLP_INSTANCE_LOADS (new_instance) = loads; - /* If the loads and stores can be handled with load/store-lane - instructions do not generate this SLP instance. */ - if (is_a <loop_vec_info> (vinfo) - && loads_permuted - && dr && vect_store_lanes_supported (vectype, group_size)) - { + /* Compute the load permutation. */ slp_tree load_node; + bool loads_permuted = false; FOR_EACH_VEC_ELT (loads, i, load_node) { - gimple *first_stmt = GROUP_FIRST_ELEMENT - (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); - stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt); - /* Use SLP for strided accesses (or if we can't load-lanes). */ - if (STMT_VINFO_STRIDED_P (stmt_vinfo) - || ! 
vect_load_lanes_supported - (STMT_VINFO_VECTYPE (stmt_vinfo), - GROUP_SIZE (stmt_vinfo))) - break; + vec<unsigned> load_permutation; + int j; + gimple *load, *first_stmt; + bool this_load_permuted = false; + load_permutation.create (group_size); + first_stmt = GROUP_FIRST_ELEMENT + (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load) + { + int load_place = vect_get_place_in_interleaving_chain + (load, first_stmt); + gcc_assert (load_place != -1); + if (load_place != j) + this_load_permuted = true; + load_permutation.safe_push (load_place); + } + if (!this_load_permuted + /* The load requires permutation when unrolling exposes + a gap either because the group is larger than the SLP + group-size or because there is a gap between the groups. */ + && (unrolling_factor == 1 + || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt)) + && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0))) + { + load_permutation.release (); + continue; + } + SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation; + loads_permuted = true; } - if (i == loads.length ()) + + if (loads_permuted) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Built SLP cancelled: can use " - "load/store-lanes\n"); - vect_free_slp_instance (new_instance); - return false; + if (!vect_supported_load_permutation_p (new_instance)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Build SLP failed: unsupported load " + "permutation "); + dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, + TDF_SLIM, stmt, 0); + dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); + } + vect_free_slp_instance (new_instance); + return false; + } + } + + /* If the loads and stores can be handled with load/store-lan + instructions do not generate this SLP instance. 
*/ + if (is_a <loop_vec_info> (vinfo) + && loads_permuted + && dr && vect_store_lanes_supported (vectype, group_size)) + { + slp_tree load_node; + FOR_EACH_VEC_ELT (loads, i, load_node) + { + gimple *first_stmt = GROUP_FIRST_ELEMENT + (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0])); + stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt); + /* Use SLP for strided accesses (or if we + can't load-lanes). */ + if (STMT_VINFO_STRIDED_P (stmt_vinfo) + || ! vect_load_lanes_supported + (STMT_VINFO_VECTYPE (stmt_vinfo), + GROUP_SIZE (stmt_vinfo))) + break; + } + if (i == loads.length ()) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Built SLP cancelled: can use " + "load/store-lanes\n"); + vect_free_slp_instance (new_instance); + return false; + } } - } - vinfo->slp_instances.safe_push (new_instance); + vinfo->slp_instances.safe_push (new_instance); - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "Final SLP tree for instance:\n"); - vect_print_slp_tree (MSG_NOTE, vect_location, node); - } + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Final SLP tree for instance:\n"); + vect_print_slp_tree (MSG_NOTE, vect_location, node); + } - return true; + return true; + } } /* Failed to SLP. */