vect_create_epilog_for_reduction had a variable called new_phis. It collected the statements that produce the exit block definitions of the vector reduction accumulators. Although those statements are indeed phis initially, they are often replaced with normal statements later, leading to puzzling code like:
FOR_EACH_VEC_ELT (new_phis, i, new_phi) { int bit_offset; if (gimple_code (new_phi) == GIMPLE_PHI) vec_temp = PHI_RESULT (new_phi); else vec_temp = gimple_assign_lhs (new_phi); Also, although the array collects statements, in practice all users want the lhs instead. This patch therefore replaces new_phis with a vector of gimple values called “reduc_inputs”. Also, reduction chains and ncopies>1 were handled with identical code (and there was a comment saying so). The patch unites them into a single “if”. gcc/ * tree-vect-loop.c (vect_create_epilog_for_reduction): Replace the new_phis vector with a reduc_inputs vector. Combine handling of reduction chains and ncopies > 1. --- gcc/tree-vect-loop.c | 113 ++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 72 deletions(-) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 8390ac80ca0..b7f73ca52c7 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -5005,7 +5005,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, imm_use_iterator imm_iter, phi_imm_iter; use_operand_p use_p, phi_use_p; gimple *use_stmt; - auto_vec<gimple *> new_phis; + auto_vec<tree> reduc_inputs; int j, i; auto_vec<tree> scalar_results; unsigned int group_size = 1, k; @@ -5017,7 +5017,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, b2 = operation (b1) */ bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); bool direct_slp_reduc; - tree new_phi_result; tree induction_index = NULL_TREE; if (slp_node) @@ -5215,7 +5214,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, if (double_reduc) loop = outer_loop; exit_bb = single_exit (loop)->dest; - new_phis.create (slp_node ? vec_num : ncopies); + reduc_inputs.create (slp_node ? vec_num : ncopies); for (unsigned i = 0; i < vec_num; i++) { if (slp_node) @@ -5223,19 +5222,14 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, else def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]); for (j = 0; j < ncopies; j++) - { + { tree new_def = copy_ssa_name (def); - phi = create_phi_node (new_def, exit_bb); - if (j == 0) - new_phis.quick_push (phi); - else - { - def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); - new_phis.quick_push (phi); - } - - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); - } + phi = create_phi_node (new_def, exit_bb); + if (j) + def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); + SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); + reduc_inputs.quick_push (new_def); + } } exit_gsi = gsi_after_labels (exit_bb); @@ -5274,52 +5268,32 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, a2 = operation (a1) a3 = operation (a2), - we may end up with more than one vector result. Here we reduce them to - one vector. */ - if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) + we may end up with more than one vector result. Here we reduce them + to one vector. + + The same is true if we couldn't use a single defuse cycle. */ + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) + || direct_slp_reduc + || ncopies > 1) { gimple_seq stmts = NULL; - tree first_vect = PHI_RESULT (new_phis[0]); - first_vect = gimple_convert (&stmts, vectype, first_vect); - for (k = 1; k < new_phis.length (); k++) + tree first_vect = gimple_convert (&stmts, vectype, reduc_inputs[0]); + for (k = 1; k < reduc_inputs.length (); k++) { - gimple *next_phi = new_phis[k]; - tree second_vect = PHI_RESULT (next_phi); - second_vect = gimple_convert (&stmts, vectype, second_vect); + tree second_vect = gimple_convert (&stmts, vectype, reduc_inputs[k]); first_vect = gimple_build (&stmts, code, vectype, first_vect, second_vect); } gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - new_phi_result = first_vect; - new_phis.truncate (0); - new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + reduc_inputs.truncate (0); + reduc_inputs.safe_push (first_vect); } - /* Likewise if we couldn't use a single defuse cycle. */ - else if (ncopies > 1) - { - gimple_seq stmts = NULL; - tree first_vect = PHI_RESULT (new_phis[0]); - first_vect = gimple_convert (&stmts, vectype, first_vect); - for (int k = 1; k < ncopies; ++k) - { - tree second_vect = PHI_RESULT (new_phis[k]); - second_vect = gimple_convert (&stmts, vectype, second_vect); - first_vect = gimple_build (&stmts, code, vectype, - first_vect, second_vect); - } - gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - new_phi_result = first_vect; - new_phis.truncate (0); - new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); - } - else - new_phi_result = PHI_RESULT (new_phis[0]); if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION && reduc_fn != IFN_LAST) { - /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing + /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing various data values where the condition matched and another vector (INDUCTION_INDEX) containing all the indexes of those matches. We need to extract the last matching index (which will be the index with @@ -5350,7 +5324,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, tree zero_vec = build_zero_cst (vectype); gimple_seq stmts = NULL; - new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); + reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); /* Find maximum value from the vector of found indexes. */ @@ -5370,7 +5344,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes with the vector (INDUCTION_INDEX) of found indexes, choosing values - from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) + from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC) otherwise. Only one value should match, resulting in a vector (VEC_COND) with one data value and the rest zeros. In the case where the loop never made any matches, every index will @@ -5389,7 +5363,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, zero. */ tree vec_cond = make_ssa_name (vectype); gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, - vec_compare, new_phi_result, + vec_compare, + reduc_inputs[0], zero_vec); gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); @@ -5437,7 +5412,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, val = data_reduc[i], idx_val = induction_index[i]; return val; */ - tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result)); + tree data_eltype = TREE_TYPE (TREE_TYPE (reduc_inputs[0])); tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); @@ -5461,7 +5436,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, build3 (BIT_FIELD_REF, data_eltype, - new_phi_result, + reduc_inputs[0], bitsize_int (el_size), bitsize_int (off))); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); @@ -5513,10 +5488,10 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, "Reduce using direct vector reduction.\n"); gimple_seq stmts = NULL; - new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); - vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); + reduc_inputs[0] = gimple_convert (&stmts, vectype, reduc_inputs[0]); + vec_elem_type = TREE_TYPE (TREE_TYPE (reduc_inputs[0])); new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn), - vec_elem_type, new_phi_result); + vec_elem_type, reduc_inputs[0]); new_temp = gimple_convert (&stmts, scalar_type, new_temp); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); @@ -5546,7 +5521,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, neutral value. We can then do a normal reduction on each vector. */ /* Enforced by vectorizable_reduction. */ - gcc_assert (new_phis.length () == 1); + gcc_assert (reduc_inputs.length () == 1); gcc_assert (pow2p_hwi (group_size)); slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; @@ -5602,7 +5577,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, sel[j] = (index[j] == i); - which selects the elements of NEW_PHI_RESULT that should + which selects the elements of REDUC_INPUTS[0] that should be included in the result. */ tree compare_val = build_int_cst (index_elt_type, i); compare_val = build_vector_from_val (index_type, compare_val); @@ -5611,11 +5586,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Calculate the equivalent of: - vec = seq ? new_phi_result : vector_identity; + vec = seq ? reduc_inputs[0] : vector_identity; VEC is now suitable for a full vector reduction. */ tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, - sel, new_phi_result, vector_identity); + sel, reduc_inputs[0], vector_identity); /* Do the reduction and convert it to the appropriate type. */ tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn), @@ -5630,7 +5605,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, bool reduce_with_shift; tree vec_temp; - gcc_assert (slp_reduc || new_phis.length () == 1); + gcc_assert (slp_reduc || reduc_inputs.length () == 1); /* See if the target wants to do the final (shift) reduction in a vector mode of smaller size and first reduce upper/lower @@ -5640,7 +5615,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); unsigned nunits1 = nunits; if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode - && new_phis.length () == 1) + && reduc_inputs.length () == 1) { nunits1 = GET_MODE_NUNITS (mode1).to_constant (); /* For SLP reductions we have to make sure lanes match up, but @@ -5672,7 +5647,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* First reduce the vector to the desired vector size we should do shift reduction on by combining upper and lower halves. */ - new_temp = new_phi_result; + new_temp = reduc_inputs[0]; while (nunits > nunits1) { nunits /= 2; @@ -5751,7 +5726,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, new_temp = make_ssa_name (vectype1); epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - new_phis[0] = epilog_stmt; + reduc_inputs[0] = new_temp; } if (reduce_with_shift && !slp_reduc) @@ -5832,13 +5807,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, int element_bitsize = tree_to_uhwi (bitsize); tree compute_type = TREE_TYPE (vectype); gimple_seq stmts = NULL; - FOR_EACH_VEC_ELT (new_phis, i, new_phi) + FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp) { int bit_offset; - if (gimple_code (new_phi) == GIMPLE_PHI) - vec_temp = PHI_RESULT (new_phi); - else - vec_temp = gimple_assign_lhs (new_phi); new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type, vec_temp, bitsize, bitsize_zero_node); @@ -5929,11 +5900,10 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, gimple_seq stmts = NULL; if (double_reduc) { - new_phi = new_phis[0]; gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); new_temp = gimple_build (&stmts, code, vectype, - PHI_RESULT (new_phi), adjustment_def); + reduc_inputs[0], adjustment_def); } else { @@ -5947,7 +5917,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, epilog_stmt = gimple_seq_last_stmt (stmts); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); scalar_results[0] = new_temp; - new_phis[0] = epilog_stmt; } if (double_reduc)