https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109088
--- Comment #15 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Hi, Richard. I confirmed that Robin's patch doesn't help with this issue.
The root cause is that if-conversion fails to recognize the loop as a possible
conditional reduction, i.e. is_cond_scalar_reduction returns FALSE.

The following patch enhances the if-conv conditional reduction recognition.
It bootstraps and passes regtest on x86, and also passes on aarch64.

diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index a8c915913ae..2bdd3710a65 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1784,14 +1784,119 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
   r_nop2 = strip_nop_cond_scalar_reduction (*has_nop, r_op2);
 
   /* Make R_OP1 to hold reduction variable.  */
+  gimple *nonphi_use_stmt = NULL;
   if (r_nop2 == PHI_RESULT (header_phi)
       && commutative_tree_code (reduction_op))
     {
       std::swap (r_op1, r_op2);
       std::swap (r_nop1, r_nop2);
     }
-  else if (r_nop1 != PHI_RESULT (header_phi))
-    return false;
+  else if (r_nop1 == PHI_RESULT (header_phi))
+    ;
+  else
+    {
+      /* Analyze the statement chain of STMT so that we can generate a
+         better if-conversion code sequence.  We are trying to catch the
+         following situation:
+
+           loop-header:
+             reduc_1 = PHI <..., reduc_2>
+           ...
+           if (...)
+             tmp1 = reduc_1 + rhs1;
+             tmp2 = tmp1 + rhs2;
+             tmp3 = tmp2 + rhs3;
+             ...
+             reduc_3 = tmpN-1 + rhsN-1;
+
+           reduc_2 = PHI <reduc_1, reduc_3>
+
+         and convert it to
+
+           reduc_2 = PHI <0, reduc_1>
+           tmp1 = rhs1;
+           tmp2 = tmp1 + rhs2;
+           tmp3 = tmp2 + rhs3;
+           ...
+           reduc_3 = tmpN-1 + rhsN-1;
+           ifcvt = cond_expr ? reduc_3 : 0;
+           reduc_1 = reduc_1 +/- ifcvt;  */
+      if (num_imm_uses (PHI_RESULT (header_phi)) != 2)
+        return false;
+      if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (PHI_RESULT (phi)))
+          && !(FLOAT_TYPE_P (TREE_TYPE (PHI_RESULT (phi)))
+               && !HONOR_SIGNED_ZEROS (TREE_TYPE (PHI_RESULT (phi)))
+               && !HONOR_SIGN_DEPENDENT_ROUNDING (TREE_TYPE (PHI_RESULT (phi)))
+               && !HONOR_NANS (TREE_TYPE (PHI_RESULT (phi)))))
+        return false;
+      gimple *prev_use_stmt, *curr_use_stmt;
+      use_operand_p use;
+      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, PHI_RESULT (header_phi))
+        {
+          prev_use_stmt = curr_use_stmt = USE_STMT (use_p);
+          if (is_gimple_assign (curr_use_stmt))
+            {
+              if (TREE_CODE (gimple_assign_lhs (curr_use_stmt)) != SSA_NAME)
+                return false;
+              if (*has_nop)
+                {
+                  if (!CONVERT_EXPR_CODE_P (
+                        gimple_assign_rhs_code (curr_use_stmt)))
+                    return false;
+                }
+              else
+                {
+                  if (gimple_assign_rhs_code (curr_use_stmt) != reduction_op)
+                    return false;
+                }
+
+              bool visited_p = false;
+              nonphi_use_stmt = curr_use_stmt;
+              while (!visited_p)
+                {
+                  if (!single_imm_use (gimple_assign_lhs (prev_use_stmt), &use,
+                                       &curr_use_stmt)
+                      || gimple_bb (curr_use_stmt) != gimple_bb (stmt)
+                      || !is_gimple_assign (curr_use_stmt)
+                      || TREE_CODE (gimple_assign_lhs (curr_use_stmt))
+                           != SSA_NAME
+                      || gimple_assign_rhs_code (curr_use_stmt) != reduction_op)
+                    return false;
+                  if (curr_use_stmt == stmt)
+                    {
+                      if (*has_nop)
+                        {
+                          if (!single_imm_use (gimple_assign_lhs (
+                                                 nonphi_use_stmt),
+                                               &use, &curr_use_stmt))
+                            return false;
+                          r_op1 = gimple_assign_lhs (nonphi_use_stmt);
+                          r_nop1 = PHI_RESULT (header_phi);
+                          nonphi_use_stmt = curr_use_stmt;
+                        }
+                      else
+                        r_op1 = PHI_RESULT (header_phi);
+
+                      if (*has_nop)
+                        {
+                          if (!single_imm_use (gimple_assign_lhs (stmt), &use,
+                                               &curr_use_stmt))
+                            return false;
+                          r_op2 = gimple_assign_lhs (stmt);
+                          r_nop2 = gimple_assign_lhs (curr_use_stmt);
+                        }
+                      else
+                        r_op2 = gimple_assign_lhs (stmt);
+                      visited_p = true;
+                    }
+                  else
+                    prev_use_stmt = curr_use_stmt;
+                }
+            }
+          else if (curr_use_stmt != phi)
+            return false;
+        }
+    }
 
   if (*has_nop)
     {
@@ -1816,12 +1921,41 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
        continue;
       if (use_stmt == stmt)
        continue;
+      if (use_stmt == nonphi_use_stmt)
+       continue;
       if (gimple_code (use_stmt) != GIMPLE_PHI)
        return false;
     }
 
   *op0 = r_op1; *op1 = r_op2;
   *reduc = stmt;
+
+  if (nonphi_use_stmt)
+    {
+      /* Transform:
+
+           if (...)
+             tmp1 = reduc_1 + rhs1;
+             tmp2 = tmp1 + rhs2;
+             tmp3 = tmp2 + rhs3;
+
+         into:
+
+           tmp1 = rhs1 + 0;   <-- we replace reduc_1 with '0'
+           tmp2 = tmp1 + rhs2;
+           tmp3 = tmp2 + rhs3;
+           ...
+           reduc_3 = tmpN-1 + rhsN-1;
+           ifcvt = cond_expr ? reduc_3 : 0;  */
+      gcc_assert (gimple_assign_rhs_code (nonphi_use_stmt) == reduction_op);
+      if (gimple_assign_rhs1 (nonphi_use_stmt) == r_op1)
+        gimple_assign_set_rhs1 (nonphi_use_stmt,
+                                build_zero_cst (TREE_TYPE (r_op1)));
+      else if (gimple_assign_rhs2 (nonphi_use_stmt) == r_op1)
+        gimple_assign_set_rhs2 (nonphi_use_stmt,
+                                build_zero_cst (TREE_TYPE (r_op1)));
+      update_stmt (nonphi_use_stmt);
+    }
 
   return true;
 }
@@ -1886,12 +2020,17 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
       gsi_remove (&stmt_it, true);
       release_defs (nop_reduc);
     }
+  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
 
   /* Delete original reduction stmt.  */
-  stmt_it = gsi_for_stmt (reduc);
-  gsi_remove (&stmt_it, true);
-  release_defs (reduc);
+  if (op1 != gimple_assign_lhs (reduc))
+    {
+      stmt_it = gsi_for_stmt (reduc);
+      gsi_remove (&stmt_it, true);
+      release_defs (reduc);
+    }
+
   return rhs;
 }

I have fully tested it with different kinds of conditional reductions; all of
them can be vectorized. I am not sure whether it is a feasible solution.
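For context, here is a minimal illustrative C loop (a sketch, not the testcase
attached to this PR; the names and array layout are made up) with the shape the
patch is meant to catch: a conditional reduction whose guarded block updates
the accumulator through a chain of additions, so the statement that feeds the
join PHI does not use the loop-header PHI directly and is_cond_scalar_reduction
previously returned false:

/* Illustrative only.  In GIMPLE the guarded block becomes a chain
     tmp1 = sum_1 + x0;  tmp2 = tmp1 + x1;  ...  sum_2 = tmp3 + x3;
   which is the statement chain handled by the new code above.  */
int
foo (int *restrict x, int *restrict cond, int n)
{
  int sum = 0;
  for (int i = 0; i < n; ++i)
    if (cond[i])
      {
        sum += x[4 * i + 0];
        sum += x[4 * i + 1];
        sum += x[4 * i + 2];
        sum += x[4 * i + 3];
      }
  return sum;
}

With the patch, if-conversion rewrites the first add of the chain to start from
0 instead of the header PHI result and folds the guarded partial sum back into
the accumulator through a COND_EXPR, as in the "and convert it to" sequence in
the comment above, so the vectorizer can treat it as a conditional reduction.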