In PR117830 a miscompilation of 464.h264ref was reported. An analysis showed that wrong code was generated because of unsatisfied assuptions in the code. This patch addresses these issues.
The first assuption was that we can independently analyse the two vec-perms at the start of a vec-perm-simplify sequence and use the information later for calculating a final vec-perm selector that utilizes less lanes. However, this information does not help much, because for changing the selector entry, we need to ensure that both elements of the operand vectors v_1 and v_2 remain equal. This is addressed by removing the function get_vect_selector_index_map and checking for this equality in the loop where we create the new selector. The calculation of the selector vector for the blended sequence assumed that the indices of the selector vector of the narrowed sequences are increasing. This assuption does not hold in general. This was fixed by allowing a wrap-around when searching for an empty lane. Further, there was an issue in the calculation of the selector vector entries for the second sequence. The code did not consider that the lanes of the second sequence could have been moved. A relevant property of this patch is, that it introduces a couple of nested loops, where the out loop iterates from i=0..nelts and the inner loop iterates from j=0..i. To avoid performance concerns, a check is introduced that ensures nelts won't exceed 4 lanes. The added test case is derived from h264ref (the other cases from the benchmark have the same structure and don't provide additional coverage). Bootstrapped and regression-tested on x86-64 and aarch64. Further, tested on CPU 2006 h264ref and CPU 2017 x264. PR117830 gcc/ChangeLog: * tree-ssa-forwprop.cc (get_vect_selector_index_map): (recognise_vec_perm_simplify_seq): (calc_perm_vec_perm_simplify_seqs): (process_vec_perm_simplify_seq_list): gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/vector-11.c: New test. Signed-off-by: Christoph Müllner <christoph.muell...@vrull.eu> --- gcc/testsuite/gcc.dg/tree-ssa/vector-11.c | 38 ++++ gcc/tree-ssa-forwprop.cc | 203 +++++++++++++--------- 2 files changed, 162 insertions(+), 79 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/vector-11.c diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c b/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c new file mode 100644 index 00000000000..e4102d318d2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-11.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details -Wno-psabi" } */ + +typedef int vec __attribute__((vector_size (4 * sizeof (int)))); + +void f1 (vec *p_v_in, vec *p_v_out_1, vec *p_v_out_2) +{ + vec sel00 = { 2, 3, 2, 2 }; + vec sel01 = { 1, 0, 1, 1 }; + vec sel10 = { 3, 2, 3, 3 }; + vec sel11 = { 0, 1, 0, 0 }; + vec sel = { 0, 5, 2, 7 }; + vec v_1, v_2, v_x, v_y, v_out_1, v_out_2; + vec v_in = *p_v_in; + + /* First vec perm sequence. */ + v_1 = __builtin_shuffle (v_in, v_in, sel00); + v_2 = __builtin_shuffle (v_in, v_in, sel01); + v_x = v_2 - v_1; + v_y = v_1 + v_2; + v_out_1 = __builtin_shuffle (v_y, v_x, sel); + + /* Second vec perm sequence. */ + v_1 = __builtin_shuffle (v_in, v_in, sel10); + v_2 = __builtin_shuffle (v_in, v_in, sel11); + v_x = v_2 - v_1; + v_y = v_1 + v_2; + v_out_2 = __builtin_shuffle (v_y, v_x, sel); + + /* Won't blend because the narrowed sequence + utilizes three of the four lanes. */ + + *p_v_out_1 = v_out_1; + *p_v_out_2 = v_out_2; +} + +/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been blended" "forwprop1" { target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */ +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 7, 2, 6 }" "forwprop1" { target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */ diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc index 7cae08f0d79..dae8c2f435b 100644 --- a/gcc/tree-ssa-forwprop.cc +++ b/gcc/tree-ssa-forwprop.cc @@ -3479,41 +3479,6 @@ fwprop_ssa_val (tree name) return name; } -/* Get an index map from the provided vector permute selector - and return the number of unique indices. - E.g.: { 1, 3, 1, 3 } -> <0, 1, 0, 1>, 2 - { 0, 2, 0, 2 } -> <0, 1, 0, 1>, 2 - { 3, 2, 1, 0 } -> <0, 1, 2, 3>, 4. */ - -static unsigned int -get_vect_selector_index_map (tree sel, vec<unsigned int> *index_map) -{ - gcc_assert (VECTOR_CST_NELTS (sel).is_constant ()); - unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant (); - unsigned int n = 0; - - for (unsigned int i = 0; i < nelts; i++) - { - /* Extract the i-th value from the selector. */ - tree sel_cst_tree = VECTOR_CST_ELT (sel, i); - unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree); - - unsigned int j = 0; - for (; j <= i; j++) - { - tree prev_sel_cst_tree = VECTOR_CST_ELT (sel, j); - unsigned int prev_sel_cst - = TREE_INT_CST_LOW (prev_sel_cst_tree); - if (prev_sel_cst == sel_cst) - break; - } - index_map->quick_push (j); - n += (i == j) ? 1 : 0; - } - - return n; -} - /* Search for opportunities to free half of the lanes in the following pattern: v_in = {e0, e1, e2, e3} @@ -3562,6 +3527,10 @@ recognise_vec_perm_simplify_seq (gassign *stmt, vec_perm_simplify_seq *seq) unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant (); + /* Don't analyse sequences with many lanes. */ + if (nelts > 4) + return false; + /* Lookup the definition of v_x and v_y. */ gassign *v_x_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x)); gassign *v_y_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_y)); @@ -3632,42 +3601,47 @@ recognise_vec_perm_simplify_seq (gassign *stmt, vec_perm_simplify_seq *seq) || nelts != VECTOR_CST_NELTS (v_2_sel).to_constant ()) return false; - /* Now check permutation selection operands. */ - auto_vec<unsigned int> v_1_lane_map, v_2_lane_map; - v_1_lane_map.reserve (nelts); - v_2_lane_map.reserve (nelts); - unsigned int v_1_lanes, v_2_lanes; - v_1_lanes = get_vect_selector_index_map (v_1_sel, &v_1_lane_map); - v_2_lanes = get_vect_selector_index_map (v_2_sel, &v_2_lane_map); - - /* Check if we could free up half of the lanes. */ - if (v_1_lanes != v_2_lanes || v_1_lanes > (nelts / 2)) - return false; - /* Create the new selector. */ vec_perm_builder new_sel_perm (nelts, nelts, 1); + auto_vec<unsigned int> lanes (nelts); + lanes.quick_grow_cleared (nelts); for (unsigned int i = 0; i < nelts; i++) { /* Extract the i-th value from the selector. */ - tree sel_cst_tree = VECTOR_CST_ELT (sel, i); - unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree); - - unsigned int j; - if (sel_cst < nelts) - j = v_1_lane_map[sel_cst]; - else - j = v_2_lane_map[sel_cst - nelts] + nelts; + unsigned int sel_cst = TREE_INT_CST_LOW (VECTOR_CST_ELT (sel, i)); + unsigned int lane = sel_cst % nelts; + unsigned int offs = sel_cst / nelts; - new_sel_perm.quick_push (j); + /* Check what's in the lane. */ + unsigned int e_1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, lane)); + unsigned int e_2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, lane)); - if (dump_file && (dump_flags & TDF_DETAILS)) + /* Reuse previous lane (if any). */ + unsigned int l = 0; + for (; l < lane; l++) { - fprintf (dump_file, "%u", j); - if (i != (nelts -1)) - fprintf (dump_file, ", "); + if ((TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, l)) == e_1) + && (TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, l)) == e_2)) + break; } + + /* Add to narrowed selector. */ + new_sel_perm.quick_push (l + offs * nelts); + + /* Mark lane as used. */ + lanes[l] = 1; } + /* Count how many lanes are need. */ + unsigned int cnt = 0; + for (unsigned int i = 0; i < nelts; i++) + cnt += lanes[i]; + + /* If more than (nelts/2) lanes are needed, skip the sequence. */ + if (cnt > nelts / 2) + return false; + + /* Check if the resulting permuation is cheap. */ vec_perm_indices new_indices (new_sel_perm, 2, nelts); tree vectype = TREE_TYPE (gimple_assign_lhs (stmt)); machine_mode vmode = TYPE_MODE (vectype); @@ -3685,8 +3659,15 @@ recognise_vec_perm_simplify_seq (gassign *stmt, vec_perm_simplify_seq *seq) if (dump_file) { - fprintf (dump_file, "Found vec perm simplify sequence ending with: "); + fprintf (dump_file, "Found vec perm simplify sequence ending with:\n\t"); print_gimple_stmt (dump_file, stmt, 0); + + if (dump_flags & TDF_DETAILS) + { + fprintf (dump_file, "\tNarrowed vec_perm selector: "); + print_generic_expr (dump_file, (*seq)->new_sel); + fprintf (dump_file, "\n"); + } } return true; @@ -3805,29 +3786,66 @@ calc_perm_vec_perm_simplify_seqs (vec_perm_simplify_seq seq1, unsigned int i; unsigned int nelts = seq1->nelts; auto_vec<int> lane_assignment; - lane_assignment.create (2 * nelts); + lane_assignment.create (nelts); /* Mark all lanes as free. */ - lane_assignment.quick_grow_cleared (2 * nelts); + lane_assignment.quick_grow_cleared (nelts); - /* Reserve lanes for seq1. */ + /* Allocate lanes for seq1. */ for (i = 0; i < nelts; i++) { unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq1->new_sel, i)); + l %= nelts; lane_assignment[l] = 1; - } +} - /* Reserve lanes for seq2 and calculate selector for seq2->stmt. */ + /* Allocate lanes for seq2 and calculate selector for seq2->stmt. */ vec_perm_builder seq2_stmt_sel_perm (nelts, nelts, 1); for (i = 0; i < nelts; i++) { - unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i)); - while (lane_assignment[l] != 0) - l++; - lane_assignment[l] = 2; - seq2_stmt_sel_perm.quick_push (l); + unsigned int sel = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i)); + unsigned int lane = sel % nelts; + unsigned int offs = sel / nelts; + unsigned int new_sel; + + /* Check if we already allocated the lane for seq2. */ + unsigned int j = 0; + for (; j < i; j++) + { + unsigned int sel_old; + sel_old = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, j)); + unsigned int lane_old = sel_old % nelts; + if (lane == lane_old) + { + new_sel = seq2_stmt_sel_perm[j].to_constant (); + new_sel = (new_sel % nelts) + offs * nelts; + break; + } + } + + /* If the lane is not allocated, we need to do that now. */ + if (j == i) + { + unsigned int l_orig = lane; + while (lane_assignment[lane] != 0) + { + lane = (lane + 1) % nelts; + + /* This should not happen if both sequences utilize no more than + half of the lanes. Test anyway to guarantee termination. */ + if (lane == l_orig) + return false; + } + + /* Allocate lane. */ + lane_assignment[lane] = 2; + new_sel = lane + offs * nelts; + } + + seq2_stmt_sel_perm.quick_push (new_sel); } + /* Check if the resulting permuation is cheap. */ seq2_stmt_indices->new_vector (seq2_stmt_sel_perm, 2, nelts); tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt)); machine_mode vmode = TYPE_MODE (vectype); @@ -3839,13 +3857,40 @@ calc_perm_vec_perm_simplify_seqs (vec_perm_simplify_seq seq1, vec_perm_builder seq1_v_2_stmt_sel_perm (nelts, nelts, 1); for (i = 0; i < nelts; i++) { - bool use_seq1 = lane_assignment[i] == 1; - tree s1 = gimple_assign_rhs3 (use_seq1 ? seq1->v_1_stmt - : seq2->v_1_stmt); - tree s2 = gimple_assign_rhs3 (use_seq1 ? seq1->v_2_stmt - : seq2->v_2_stmt); - unsigned int l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i)) % nelts; - unsigned int l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i)) % nelts; + bool use_seq1 = lane_assignment[i] != 2; + unsigned int l1, l2; + + if (use_seq1) + { + /* Just reuse the selector indices. */ + tree s1 = gimple_assign_rhs3 (seq1->v_1_stmt); + tree s2 = gimple_assign_rhs3 (seq1->v_2_stmt); + l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i)); + l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i)); + } + else + { + /* We moved the lanes for seq2, so we need to adjust for that. */ + tree s1 = gimple_assign_rhs3 (seq2->v_1_stmt); + tree s2 = gimple_assign_rhs3 (seq2->v_2_stmt); + + unsigned int j = 0; + for (; j < i; j++) + { + unsigned int sel_new; + sel_new = seq2_stmt_sel_perm[j].to_constant (); + sel_new %= nelts; + if (sel_new == i) + break; + } + + /* This should not happen. Test anyway to guarantee correctness. */ + if (j == i) + return false; + + l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, j)); + l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, j)); + } seq1_v_1_stmt_sel_perm.quick_push (l1 + (use_seq1 ? 0 : nelts)); seq1_v_2_stmt_sel_perm.quick_push (l2 + (use_seq1 ? 0 : nelts)); @@ -3960,7 +4005,7 @@ process_vec_perm_simplify_seq_list (vec<vec_perm_simplify_seq> *l) return; if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "Processing %u vec perm simplify sequences.\n", + fprintf (dump_file, "\nProcessing %u vec perm simplify sequences.\n", l->length ()); FOR_EACH_VEC_ELT (*l, i, seq1) -- 2.47.1