On Wed, May 26, 2021 at 8:41 PM Richard Biener <richard.guent...@gmail.com> wrote: > > On Wed, May 26, 2021 at 7:06 AM Hongtao Liu <crazy...@gmail.com> wrote: > > > > On Tue, May 25, 2021 at 6:24 PM Richard Biener > > <richard.guent...@gmail.com> wrote: > > > > > > On Mon, May 24, 2021 at 11:52 AM Hongtao Liu <crazy...@gmail.com> wrote: > > > > > > > > Hi: > > > > Details described in PR. > > > > Bootstrapped and regtest on > > > > x86_64-linux-gnu{-m32,}/x86_64-linux-gnu{-m32\ > > > > -march=cascadelake,-march=cascadelake} > > > > Ok for trunk? > > > > > > +static tree > > > +strip_nop_cond_scalar_reduction (bool has_nop, tree op) > > > +{ > > > + if (!has_nop) > > > + return op; > > > + > > > + if (TREE_CODE (op) != SSA_NAME) > > > + return NULL_TREE; > > > + > > > + gimple* stmt = SSA_NAME_DEF_STMT (op); > > > + if (!stmt > > > + || gimple_code (stmt) != GIMPLE_ASSIGN > > > + || gimple_has_volatile_ops (stmt) > > > + || gimple_assign_rhs_code (stmt) != NOP_EXPR) > > > + return NULL_TREE; > > > + > > > + return gimple_assign_rhs1 (stmt); > > > > > > this allows arbitrary conversions where the comment suggests you > > > only want to allow conversions to the same precision but different sign. > > > Sth like > > > > > > gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op)); > > > if (!stmt > > > || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)) > > > || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE > > > (gimple_assign_rhs1 (stmt)))) > > > return NULL_TREE; > > > Changed. > > > + if (gimple_bb (stmt) != gimple_bb (*nop_reduc) > > > + || gimple_code (stmt) != GIMPLE_ASSIGN > > > + || gimple_has_volatile_ops (stmt)) > > > + return false; > > > > > > !is_gimple_assign (stmt) instead of gimple_code (stmt) != GIMPLE_ASSIGN > > > Changed. > > > the gimple_has_volatile_ops check is superfluous given you restrict > > > the assign code. > > > Changed. > > > + /* Check that R_NOP1 is used in nop_stmt or in PHI only. 
*/ > > > + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1) > > > + { > > > + gimple *use_stmt = USE_STMT (use_p); > > > + if (is_gimple_debug (use_stmt)) > > > + continue; > > > + if (use_stmt == SSA_NAME_DEF_STMT (r_op1)) > > > + continue; > > > + if (gimple_code (use_stmt) != GIMPLE_PHI) > > > + return false; > > > > > > can the last check be use_stmt == phi since we should have the > > > PHI readily available? > > > Yes, changed. > > > @@ -1735,6 +1822,23 @@ convert_scalar_cond_reduction (gimple *reduc, > > > gimple_stmt_iterator *gsi, > > > rhs = fold_build2 (gimple_assign_rhs_code (reduc), > > > TREE_TYPE (rhs1), op0, tmp); > > > > > > + if (has_nop) > > > + { > > > + /* Create assignment nop_rhs = op0 +/- _ifc_. */ > > > + tree nop_rhs = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, > > > "_nop_"); > > > + gimple* new_assign2 = gimple_build_assign (nop_rhs, rhs); > > > + gsi_insert_before (gsi, new_assign2, GSI_SAME_STMT); > > > + /* Rebuild rhs for nop_expr. */ > > > + rhs = fold_build1 (NOP_EXPR, > > > + TREE_TYPE (gimple_assign_lhs (nop_reduc)), > > > + nop_rhs); > > > + > > > + /* Delete nop_reduc. */ > > > + stmt_it = gsi_for_stmt (nop_reduc); > > > + gsi_remove (&stmt_it, true); > > > + release_defs (nop_reduc); > > > + } > > > + > > > > > > hmm, the whole function could be cleaned up with sth like > > > > > > /* Build rhs for unconditional increment/decrement. */ > > > gimple_seq stmts = NULL; > > > rhs = gimple_build (&stmts, gimple_assing_rhs_code (reduc), > > > TREE_TYPE (rhs1), op0, tmp); > > > if (has_nop) > > > rhs = gimple_convert (&stmts, TREE_TYPE (gimple_assign_lhs > > > (nop_reduc)), rhs); > > > gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); > > > > > > plus in the caller moving the > > > > > > new_stmt = gimple_build_assign (res, rhs); > > > gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); > > > > > > to the else branch as well as the folding done on new_stmt (maybe return > > > new_stmt instead of rhs from convert_scalar_cond_reduction. 
> > Eventually, we needed to assign rhs to res, and with an extra mov stmt > > from rhs to res, the vectorizer failed. > > the only difference in 166t.ifcvt between successfully vectorization > > and failed vectorization is below > > char * _24; > > char _25; > > unsigned char _ifc__29; > > + unsigned char _30; > > > > <bb 2> [local count: 118111600]: > > if (n_10(D) != 0) > > @@ -70,7 +71,8 @@ char foo2 (char * a, char * c, int n) > > _5 = c_14(D) + _1; > > _6 = *_5; > > _ifc__29 = _3 == _6 ? 1 : 0; > > - cnt_7 = cnt_18 + _ifc__29; > > + _30 = cnt_18 + _ifc__29; > > + cnt_7 = _30; > > i_16 = i_20 + 1; > > if (n_10(D) != i_16) > > goto <bb 9>; [89.00%] > > @@ -110,7 +112,7 @@ char foo2 (char * a, char * c, int n) > > goto <bb 12>; [100.00%] > > > > <bb 6> [local count: 105119324]: > > - # cnt_19 = PHI <cnt_7(3), cnt_27(15)> > > + # cnt_19 = PHI <_30(3), cnt_27(15)> > > _21 = (char) cnt_19; > > > > if we want to eliminate the extra move, gimple_build and > > gimple_convert is not suitable since they create a new lhs, is there > > any interface like gimple_build_assign but accept stmts? > > Hmm, OK. So what you could do is replacing all uses of 'res' > with 'rhs', alternatively replacing the last generated stmt LHS > by 'res'. Both is a bit hackish of course. Usually the vectorizer > just ignores copies like this but the reduction matching is > unnecessarily strict here (but it's also a bit awkward to fix there). > > There's redundant_ssa_names which seems to be designed to > handle propagating those out and there's the do_rpo_vn run, > so I wonder why the stmt remains. Now, for redundant_ssa_names > you'd need to push a std::pair (res, rhs) to it in case rhs is an > SSA name - does that help? Yes, it worked. > > Richard. > > > > > > Richard. > > > > > > > gcc/ChangeLog: > > > > > > > > PR tree-optimization/pr98365 > > > > * tree-if-conv.c (strip_nop_cond_scalar_reduction): New > > > > function. 
> > > > (is_cond_scalar_reduction): Handle nop_expr in cond scalar > > > > reduction. > > > > (convert_scalar_cond_reduction): Ditto. > > > > (predicate_scalar_phi): Ditto. > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > PR tree-optimization/pr98365 > > > > * gcc.target/i386/pr98365.c: New test. > > > > > > > > -- > > > > BR, > > > > Hongtao > > > > > > > > -- > > BR, > > Hongtao
Updated patch. -- BR, Hongtao
From f0593d1866af6f752e70e2ef2f1477471c252536 Mon Sep 17 00:00:00 2001 From: liuhongt <hongtao....@intel.com> Date: Wed, 6 Jan 2021 16:33:27 +0800 Subject: [PATCH] Extend is_cond_scalar_reduction to handle nop_expr after/before scalar reduction.[PR98365] gcc/ChangeLog: PR tree-optimization/pr98365 * tree-if-conv.c (strip_nop_cond_scalar_reduction): New function. (is_cond_scalar_reduction): Handle nop_expr in cond scalar reduction. (convert_scalar_cond_reduction): Ditto. (predicate_scalar_phi): Ditto. gcc/testsuite/ChangeLog: PR tree-optimization/pr98365 * gcc.target/i386/pr98365.c: New test. --- gcc/testsuite/gcc.target/i386/pr98365.c | 22 ++++ gcc/tree-if-conv.c | 142 +++++++++++++++++++++--- 2 files changed, 146 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr98365.c diff --git a/gcc/testsuite/gcc.target/i386/pr98365.c b/gcc/testsuite/gcc.target/i386/pr98365.c new file mode 100644 index 00000000000..652210dcdd5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr98365.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ +short foo1 (short* a, short* c, int n) +{ + int i; + short cnt=0; + for (int i = 0;i != n; i++) + if (a[i] == c[i]) + cnt++; + return cnt; +} + +char foo2 (char* a, char* c, int n) +{ + int i; + char cnt=0; + for (int i = 0;i != n; i++) + if (a[i] == c[i]) + cnt++; + return cnt; +} diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c index 716eae44a21..5eed087de36 100644 --- a/gcc/tree-if-conv.c +++ b/gcc/tree-if-conv.c @@ -1579,6 +1579,31 @@ if_convertible_loop_p (class loop *loop) return res; } +/* Return reduc_1 if has_nop. + + if (...) + tmp1 = (unsigned type) reduc_1; + tmp2 = tmp1 + rhs2; + reduc_3 = (signed type) tmp2. 
*/ +static tree +strip_nop_cond_scalar_reduction (bool has_nop, tree op) +{ + if (!has_nop) + return op; + + if (TREE_CODE (op) != SSA_NAME) + return NULL_TREE; + + gassign *stmt = safe_dyn_cast <gassign *> (SSA_NAME_DEF_STMT (op)); + if (!stmt + || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)) + || !tree_nop_conversion_p (TREE_TYPE (op), TREE_TYPE + (gimple_assign_rhs1 (stmt)))) + return NULL_TREE; + + return gimple_assign_rhs1 (stmt); +} + /* Returns true if def-stmt for phi argument ARG is simple increment/decrement which is in predicated basic block. In fact, the following PHI pattern is searching: @@ -1595,9 +1620,10 @@ if_convertible_loop_p (class loop *loop) static bool is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1, - tree *op0, tree *op1, bool extended) + tree *op0, tree *op1, bool extended, bool* has_nop, + gimple **nop_reduc) { - tree lhs, r_op1, r_op2; + tree lhs, r_op1, r_op2, r_nop1, r_nop2; gimple *stmt; gimple *header_phi = NULL; enum tree_code reduction_op; @@ -1608,7 +1634,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1, use_operand_p use_p; edge e; edge_iterator ei; - bool result = false; + bool result = *has_nop = false; if (TREE_CODE (arg_0) != SSA_NAME || TREE_CODE (arg_1) != SSA_NAME) return false; @@ -1656,18 +1682,77 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1, return false; reduction_op = gimple_assign_rhs_code (stmt); + + /* Catch something like below + + loop-header: + reduc_1 = PHI <..., reduc_2> + ... + if (...) + tmp1 = (unsigned type) reduc_1; + tmp2 = tmp1 + rhs2; + reduc_3 = (signed type) tmp2; + + reduc_2 = PHI <reduc_1, reduc_3> + + and convert to + + reduc_2 = PHI <0, reduc_3> + tmp1 = (unsigned type)reduce_1; + ifcvt = cond_expr ? 
rhs2 : 0 + tmp2 = tmp1 +/- ifcvt; + reduce_1 = (signed type)tmp2; */ + + if (reduction_op == NOP_EXPR) + { + lhs = gimple_assign_rhs1 (stmt); + if (TREE_CODE (lhs) != SSA_NAME + || !has_single_use (lhs)) + return false; + + *nop_reduc = stmt; + stmt = SSA_NAME_DEF_STMT (lhs); + if (gimple_bb (stmt) != gimple_bb (*nop_reduc) + || !is_gimple_assign (stmt)) + return false; + + *has_nop = true; + reduction_op = gimple_assign_rhs_code (stmt); + } + if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR) return false; r_op1 = gimple_assign_rhs1 (stmt); r_op2 = gimple_assign_rhs2 (stmt); + r_nop1 = strip_nop_cond_scalar_reduction (*has_nop, r_op1); + r_nop2 = strip_nop_cond_scalar_reduction (*has_nop, r_op2); + /* Make R_OP1 to hold reduction variable. */ - if (r_op2 == PHI_RESULT (header_phi) + if (r_nop2 == PHI_RESULT (header_phi) && reduction_op == PLUS_EXPR) - std::swap (r_op1, r_op2); - else if (r_op1 != PHI_RESULT (header_phi)) + { + std::swap (r_op1, r_op2); + std::swap (r_nop1, r_nop2); + } + else if (r_nop1 != PHI_RESULT (header_phi)) return false; + if (*has_nop) + { + /* Check that R_NOP1 is used in nop_stmt or in PHI only. */ + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_nop1) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + if (use_stmt == SSA_NAME_DEF_STMT (r_op1)) + continue; + if (use_stmt != phi) + return false; + } + } + /* Check that R_OP1 is used in reduction stmt or in PHI only. 
*/ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, r_op1) { @@ -1705,7 +1790,8 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1, static tree convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi, - tree cond, tree op0, tree op1, bool swap) + tree cond, tree op0, tree op1, bool swap, + bool has_nop, gimple* nop_reduc) { gimple_stmt_iterator stmt_it; gimple *new_assign; @@ -1714,6 +1800,7 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi, tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_"); tree c; tree zero = build_zero_cst (TREE_TYPE (rhs1)); + gimple_seq stmts = NULL; if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -1732,8 +1819,18 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi, new_assign = gimple_build_assign (tmp, c); gsi_insert_before (gsi, new_assign, GSI_SAME_STMT); /* Build rhs for unconditional increment/decrement. */ - rhs = fold_build2 (gimple_assign_rhs_code (reduc), - TREE_TYPE (rhs1), op0, tmp); + rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc), + TREE_TYPE (rhs1), op0, tmp); + + if (has_nop) + { + rhs = gimple_convert (&stmts, + TREE_TYPE (gimple_assign_lhs (nop_reduc)), rhs); + stmt_it = gsi_for_stmt (nop_reduc); + gsi_remove (&stmt_it, true); + release_defs (nop_reduc); + } + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); /* Delete original reduction stmt. 
*/ stmt_it = gsi_for_stmt (reduc); @@ -1808,7 +1905,7 @@ ifcvt_follow_ssa_use_edges (tree val) static void predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi) { - gimple *new_stmt = NULL, *reduc; + gimple *new_stmt = NULL, *reduc, *nop_reduc; tree rhs, res, arg0, arg1, op0, op1, scev; tree cond; unsigned int index0; @@ -1816,6 +1913,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi) edge e; basic_block bb; unsigned int i; + bool has_nop; res = gimple_phi_result (phi); if (virtual_operand_p (res)) @@ -1876,10 +1974,15 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi) arg1 = gimple_phi_arg_def (phi, 1); } if (is_cond_scalar_reduction (phi, &reduc, arg0, arg1, - &op0, &op1, false)) - /* Convert reduction stmt into vectorizable form. */ - rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1, - true_bb != gimple_bb (reduc)); + &op0, &op1, false, &has_nop, + &nop_reduc)) + { + /* Convert reduction stmt into vectorizable form. */ + rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1, + true_bb != gimple_bb (reduc), + has_nop, nop_reduc); + redundant_ssa_names.safe_push (std::make_pair (res, rhs)); + } else /* Build new RHS using selected condition and arguments. */ rhs = fold_build_cond_expr (TREE_TYPE (res), unshare_expr (cond), @@ -1961,14 +2064,17 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi) is_gimple_condexpr, NULL_TREE, true, GSI_SAME_STMT); if (!(is_cond_scalar_reduction (phi, &reduc, arg0 , arg1, - &op0, &op1, true))) + &op0, &op1, true, &has_nop, &nop_reduc))) rhs = fold_build_cond_expr (TREE_TYPE (res), unshare_expr (cond), swap? arg1 : arg0, swap? arg0 : arg1); else - /* Convert reduction stmt into vectorizable form. */ - rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1, - swap); + { + /* Convert reduction stmt into vectorizable form. 
*/ + rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1, + swap,has_nop, nop_reduc); + redundant_ssa_names.safe_push (std::make_pair (res, rhs)); + } new_stmt = gimple_build_assign (res, rhs); gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); update_stmt (new_stmt); -- 2.18.1