Hi, This patch allows to vectorize a subchain of interleaved loads in basic block SLP (in loop vectorization this would be more complicated because of loop peeling). This patch also swaps operands if necessary (and possible) to make operations isomorphic.
Bootstrapped and tested on powerpc64-suse-linux. Committed. Ira ChangeLog: * tree-vect-stmts.c (vectorizable_load): For SLP without permutation treat the first load of the node as the first element in its interleaving chain. * tree-vect-slp.c (vect_get_and_check_slp_defs): Swap the operands if necessary and possible. (vect_build_slp_tree): Add new argument. Allow load groups of any size in basic blocks. Keep all the loads for further permutation check. Use the new argument to determine if there is a permutation. Update the recursive calls. (vect_supported_load_permutation_p): Allow subchains of interleaving chains in basic block vectorization. (vect_analyze_slp_instance): Update the call to vect_build_slp_tree. Check load permutation based on the new parameter. (vect_schedule_slp_instance): Don't start from the first element in interleaving chain unless the loads are permuted. testsuite/ChangeLog: * gcc.dg/vect/bb-slp-29.c: New test.
Index: ChangeLog =================================================================== --- ChangeLog (revision 180052) +++ ChangeLog (working copy) @@ -1,3 +1,21 @@ +2011-10-16 Ira Rosen <ira.ro...@linaro.org> + + * tree-vect-stmts.c (vectorizable_load): For SLP without permutation + treat the first load of the node as the first element in its + interleaving chain. + * tree-vect-slp.c (vect_get_and_check_slp_defs): Swap the operands if + necessary and possible. + (vect_build_slp_tree): Add new argument. Allow load groups of any size + in basic blocks. Keep all the loads for further permutation check. + Use the new argument to determine if there is a permutation. Update + the recursive calls. + (vect_supported_load_permutation_p): Allow subchains of interleaving + chains in basic block vectorization. + (vect_analyze_slp_instance): Update the call to vect_build_slp_tree. + Check load permutation based on the new parameter. + (vect_schedule_slp_instance): Don't start from the first element in + interleaving chain unless the loads are permuted. + 2011-10-15 Richard Henderson <r...@redhat.com> * tree-vect-slp.c: Include langhooks.h. Index: testsuite/gcc.dg/vect/bb-slp-29.c =================================================================== --- testsuite/gcc.dg/vect/bb-slp-29.c (revision 0) +++ testsuite/gcc.dg/vect/bb-slp-29.c (revision 0) @@ -0,0 +1,59 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define A 3 +#define B 4 +#define N 256 + +short src[N], dst[N]; + +void foo (short * __restrict__ dst, short * __restrict__ src, int h, int stride, int dummy) +{ + int i; + h /= 16; + for (i = 0; i < h; i++) + { + dst[0] = A*src[0] + B*src[1]; + dst[1] = A*src[1] + B*src[2]; + dst[2] = A*src[2] + B*src[3]; + dst[3] = A*src[3] + B*src[4]; + dst[4] = A*src[4] + B*src[5]; + dst[5] = A*src[5] + B*src[6]; + dst[6] = A*src[6] + B*src[7]; + dst[7] = A*src[7] + B*src[8]; + dst += stride; + src += stride; + if (dummy == 32) + abort (); + } +} + + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N; i++) + { + dst[i] = 0; + src[i] = i; + } + + foo (dst, src, N, 8, 0); + + for (i = 0; i < N/2; i++) + { + if (dst[i] != A * src[i] + B * src[i+1]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target { vect_int_mult && vect_element_align } } } } */ +/* { dg-final { cleanup-tree-dump "slp" } } */ + Index: testsuite/ChangeLog =================================================================== --- testsuite/ChangeLog (revision 180052) +++ testsuite/ChangeLog (working copy) @@ -1,3 +1,7 @@ +2011-10-16 Ira Rosen <ira.ro...@linaro.org> + + * gcc.dg/vect/bb-slp-29.c: New test. + 2011-10-15 Paolo Carlini <paolo.carl...@oracle.com> PR c++/50732 Index: tree-vect-stmts.c =================================================================== --- tree-vect-stmts.c (revision 180052) +++ tree-vect-stmts.c (working copy) @@ -4241,6 +4241,11 @@ vectorizable_load (gimple stmt, gimple_stmt_iterat if (strided_load) { first_stmt = GROUP_FIRST_ELEMENT (stmt_info); + if (slp + && !SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance) + && first_stmt != VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0)) + first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0); + /* Check if the chain of loads is already vectorized. */ if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt))) { Index: tree-vect-slp.c =================================================================== --- tree-vect-slp.c (revision 180052) +++ tree-vect-slp.c (working copy) @@ -116,13 +116,15 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vi { tree oprnd; unsigned int i, number_of_oprnds; - tree def; + tree def[2]; gimple def_stmt; enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; stmt_vec_info stmt_info = vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0)); enum gimple_rhs_class rhs_class; struct loop *loop = NULL; + enum tree_code rhs_code; + bool different_types = false; if (loop_vinfo) loop = LOOP_VINFO_LOOP (loop_vinfo); @@ -134,7 +136,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vi { oprnd = gimple_op (stmt, i + 1); - if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def, + if (!vect_is_simple_use (oprnd, loop_vinfo, bb_vinfo, &def_stmt, &def[i], &dt[i]) || (!def_stmt && dt[i] != vect_constant_def)) { @@ -189,11 +191,11 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vi switch (gimple_code (def_stmt)) { case GIMPLE_PHI: - def = gimple_phi_result (def_stmt); + def[i] = gimple_phi_result (def_stmt); break; case GIMPLE_ASSIGN: - def = gimple_assign_lhs (def_stmt); + def[i] = gimple_assign_lhs (def_stmt); break; default: @@ -207,8 +209,8 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vi { /* op0 of the first stmt of the group - store its info. */ *first_stmt_dt0 = dt[i]; - if (def) - *first_stmt_def0_type = TREE_TYPE (def); + if (def[i]) + *first_stmt_def0_type = TREE_TYPE (def[i]); else *first_stmt_const_oprnd = oprnd; @@ -228,8 +230,8 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vi { /* op1 of the first stmt of the group - store its info. */ *first_stmt_dt1 = dt[i]; - if (def) - *first_stmt_def1_type = TREE_TYPE (def); + if (def[i]) + *first_stmt_def1_type = TREE_TYPE (def[i]); else { /* We assume that the stmt contains only one constant @@ -255,24 +257,56 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vi && ((*first_stmt_dt0 != dt[i] && !(*first_stmt_dt0 == vect_reduction_def && dt[i] == vect_internal_def)) - || (*first_stmt_def0_type && def + || (*first_stmt_def0_type && def[0] && !types_compatible_p (*first_stmt_def0_type, - TREE_TYPE (def))))) + TREE_TYPE (def[0]))))) || (i == 1 && ((*first_stmt_dt1 != dt[i] && !(*first_stmt_dt1 == vect_reduction_def && dt[i] == vect_internal_def)) - || (*first_stmt_def1_type && def + || (*first_stmt_def1_type && def[1] && !types_compatible_p (*first_stmt_def1_type, - TREE_TYPE (def))))) - || (!def + TREE_TYPE (def[1]))))) + || (!def[i] && !types_compatible_p (TREE_TYPE (*first_stmt_const_oprnd), - TREE_TYPE (oprnd)))) + TREE_TYPE (oprnd))) + || different_types) { - if (vect_print_dump_info (REPORT_SLP)) - fprintf (vect_dump, "Build SLP failed: different types "); + if (i != number_of_oprnds - 1) + different_types = true; + else + { + if (is_gimple_assign (stmt) + && (rhs_code = gimple_assign_rhs_code (stmt)) + && TREE_CODE_CLASS (rhs_code) == tcc_binary + && commutative_tree_code (rhs_code) + && *first_stmt_dt0 == dt[1] + && *first_stmt_dt1 == dt[0] + && def[0] && def[1] + && !(*first_stmt_def0_type + && !types_compatible_p (*first_stmt_def0_type, + TREE_TYPE (def[1]))) + && !(*first_stmt_def1_type + && !types_compatible_p (*first_stmt_def1_type, + TREE_TYPE (def[0])))) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Swapping operands of "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } - return false; + swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt), + gimple_assign_rhs2_ptr (stmt)); + } + else + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: different types "); + + return false; + } + } } } } @@ -286,10 +320,10 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vi case vect_internal_def: case vect_reduction_def: - if (i == 0) + if ((i == 0 && !different_types) || (i == 1 && different_types)) VEC_safe_push (gimple, heap, *def_stmts0, def_stmt); else - VEC_safe_push (gimple, heap, *def_stmts1, def_stmt); + VEC_safe_push (gimple, heap, *def_stmts1, def_stmt); break; default: @@ -297,7 +331,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vi if (vect_print_dump_info (REPORT_SLP)) { fprintf (vect_dump, "Build SLP failed: illegal type of def "); - print_generic_expr (vect_dump, def, TDF_SLIM); + print_generic_expr (vect_dump, def[i], TDF_SLIM); } return false; @@ -320,7 +354,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_ int ncopies_for_cost, unsigned int *max_nunits, VEC (int, heap) **load_permutation, VEC (slp_tree, heap) **loads, - unsigned int vectorization_factor) + unsigned int vectorization_factor, bool *loads_permuted) { VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size); VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size); @@ -531,7 +565,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_ /* Check that the size of interleaved loads group is not greater than the SLP group size. */ - if (GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size) + if (loop_vinfo + && GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size) { if (vect_print_dump_info (REPORT_SLP)) { @@ -652,19 +687,22 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_ /* Strided loads were reached - stop the recursion. */ if (stop_recursion) { + VEC_safe_push (slp_tree, heap, *loads, *node); if (permutation) { - VEC_safe_push (slp_tree, heap, *loads, *node); + + *loads_permuted = true; *inside_cost += targetm.vectorize.builtin_vectorization_cost (vec_perm, NULL, 0) * group_size; } else - { - /* We don't check here complex numbers chains, so we keep them in - LOADS for further check in vect_supported_load_permutation_p. */ + { + /* We don't check here complex numbers chains, so we set + LOADS_PERMUTED for further check in + vect_supported_load_permutation_p. */ if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR) - VEC_safe_push (slp_tree, heap, *loads, *node); + *loads_permuted = true; } return true; @@ -683,7 +721,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_ if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &left_node, group_size, inside_cost, outside_cost, ncopies_for_cost, max_nunits, load_permutation, loads, - vectorization_factor)) + vectorization_factor, loads_permuted)) return false; SLP_TREE_LEFT (*node) = left_node; @@ -701,7 +739,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_ if (!vect_build_slp_tree (loop_vinfo, bb_vinfo, &right_node, group_size, inside_cost, outside_cost, ncopies_for_cost, max_nunits, load_permutation, loads, - vectorization_factor)) + vectorization_factor, loads_permuted)) return false; SLP_TREE_RIGHT (*node) = right_node; @@ -887,8 +925,10 @@ vect_supported_load_permutation_p (slp_instance sl bool supported, bad_permutation = false; sbitmap load_index; slp_tree node, other_complex_node; - gimple stmt, first = NULL, other_node_first; + gimple stmt, first = NULL, other_node_first, load, next_load, first_load; unsigned complex_numbers = 0; + struct data_reference *dr; + bb_vec_info bb_vinfo; /* FORNOW: permutations are only supported in SLP. */ if (!slp_instn) @@ -1050,6 +1090,76 @@ vect_supported_load_permutation_p (slp_instance sl } } + /* In basic block vectorization we allow any subchain of an interleaving + chain. + FORNOW: not supported in loop SLP because of realignment compications. */ + bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)); + bad_permutation = false; + /* Check that for every node in the instance teh loads form a subchain. */ + if (bb_vinfo) + { + FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node) + { + next_load = NULL; + first_load = NULL; + FOR_EACH_VEC_ELT (gimple, SLP_TREE_SCALAR_STMTS (node), j, load) + { + if (!first_load) + first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)); + else if (first_load + != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load))) + { + bad_permutation = true; + break; + } + + if (j != 0 && next_load != load) + { + bad_permutation = true; + break; + } + + next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load)); + } + + if (bad_permutation) + break; + } + + /* Check that the alignment of the first load in every subchain, i.e., + the first statement in every load node, is supported. */ + if (!bad_permutation) + { + FOR_EACH_VEC_ELT (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node) + { + first_load = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); + if (first_load + != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load))) + { + dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load)); + if (vect_supportable_dr_alignment (dr, false) + == dr_unaligned_unsupported) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "unsupported unaligned load "); + print_gimple_stmt (vect_dump, first_load, 0, + TDF_SLIM); + } + bad_permutation = true; + break; + } + } + } + + if (!bad_permutation) + { + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn)); + return true; + } + } + } + /* FORNOW: the only supported permutation is 0..01..1.. of length equal to GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as well (unless it's reduction). */ @@ -1159,6 +1269,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinf VEC (int, heap) *load_permutation; VEC (slp_tree, heap) *loads; struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); + bool loads_permuted = false; if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) { @@ -1250,7 +1361,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinf if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size, &inside_cost, &outside_cost, ncopies_for_cost, &max_nunits, &load_permutation, &loads, - vectorization_factor)) + vectorization_factor, &loads_permuted)) { /* Calculate the unrolling factor based on the smallest type. */ if (max_nunits > nunits) @@ -1275,7 +1386,8 @@ vect_analyze_slp_instance (loop_vec_info loop_vinf SLP_INSTANCE_LOADS (new_instance) = loads; SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL; SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation; - if (VEC_length (slp_tree, loads)) + + if (loads_permuted) { if (!vect_supported_load_permutation_p (new_instance, group_size, load_permutation)) @@ -2567,10 +2679,11 @@ vect_schedule_slp_instance (slp_tree node, slp_ins /* Loads should be inserted before the first load. */ if (SLP_INSTANCE_FIRST_LOAD_STMT (instance) && STMT_VINFO_STRIDED_ACCESS (stmt_info) - && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))) + && !REFERENCE_CLASS_P (gimple_get_lhs (stmt)) + && SLP_INSTANCE_LOAD_PERMUTATION (instance)) si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance)); else if (is_pattern_stmt_p (stmt_info)) - si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)); + si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)); else si = gsi_for_stmt (stmt);