On Tue, Nov 9, 2021 at 3:09 AM liuhongt <hongtao....@intel.com> wrote: > > This will enable transformation like > > - # sum1_50 = PHI <prephitmp_64(13), 0(4)> > - # sum2_52 = PHI <sum2_21(13), 0(4)> > + # sum1_50 = PHI <_87(13), 0(4)> > + # sum2_52 = PHI <_89(13), 0(4)> > # ivtmp_62 = PHI <ivtmp_61(13), 64(4)> > i.2_7 = (long unsigned int) i_49; > _8 = i.2_7 * 8; > ... > vec1_i_38 = vec1_29 >> _10; > vec2_i_39 = vec2_31 >> _10; > _11 = vec1_i_38 & 1; > - _63 = tmp_37 ^ sum1_50; > - prephitmp_64 = _11 == 0 ? sum1_50 : _63; > + _ifc__86 = _11 != 0 ? tmp_37 : 0; > + _87 = sum1_50 ^ _ifc__86; > _12 = vec2_i_39 & 1; > : > > so that vectorizer won't fail due to > > /* If this isn't a nested cycle or if the nested cycle reduction value > is used outside of the inner loop we cannot handle uses of the reduction > value. */ > if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "reduction used in loop.\n"); > return NULL; > } > > Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,} > Ok for trunk? > > gcc/ChangeLog: > > PR tree-optimization/103126 > * tree-if-conv.c (is_cond_scalar_reduction): Handle > BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR. > (convert_scalar_cond_reduction): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/ifcvt-reduction-logic-op.c: New test. > --- > .../i386/ifcvt-reduction-logic-op.c | 80 +++++++++++++++++++ > gcc/tree-if-conv.c | 19 +++-- > 2 files changed, 92 insertions(+), 7 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c > > diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c > b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c > new file mode 100644 > index 00000000000..eeb822d5d43 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c > @@ -0,0 +1,80 @@ > +/* PR tree-optimization/103126. 
*/ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } > */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } > */ > +#include<stdint.h> > + > +void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, > uint64_t *__restrict ans, > + int64_t n) > +{ > + int64_t i; > + uint64_t vec1, sum1; > + uint64_t vec2, sum2; > + > + while (n > 0) { > + sum1 = 0; > + vec1 = a[n]; > + sum2 = 0; > + vec2 = b[n]; > + > + for (i = 0; i < 64; i++) { > + uint64_t tmp = mat[i]; > + uint64_t vec1_i = (vec1 >> i); > + uint64_t vec2_i = (vec2 >> i); > + sum1 ^= (vec1_i & 1) ? tmp : 0; > + if (vec2_i&1) sum2 ^= tmp; > + } > + *ans++ ^= sum1; n--; > + *ans++ ^= sum2; n--; > + } > +} > + > +void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, > uint64_t *__restrict ans, > + int64_t n) > +{ > + int64_t i; > + uint64_t vec1, sum1; > + uint64_t vec2, sum2; > + > + while (n > 0) { > + sum1 = 0; > + vec1 = a[n]; > + sum2 = 0; > + vec2 = b[n]; > + > + for (i = 0; i < 64; i++) { > + uint64_t tmp = mat[i]; > + uint64_t vec1_i = (vec1 >> i); > + uint64_t vec2_i = (vec2 >> i); > + sum1 |= (vec1_i & 1) ? tmp : 0; > + if (vec2_i&1) sum2 |= tmp; > + } > + *ans++ |= sum1; n--; > + *ans++ |= sum2; n--; > + } > +} > + > +void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, > uint64_t *__restrict ans, > + int64_t n) > +{ > + int64_t i; > + uint64_t vec1, sum1; > + uint64_t vec2, sum2; > + > + while (n > 0) { > + sum1 = -1; > + vec1 = a[n]; > + sum2 = 0; > + vec2 = b[n]; > + > + for (i = 0; i < 64; i++) { > + uint64_t tmp = mat[i]; > + uint64_t vec1_i = (vec1 >> i); > + uint64_t vec2_i = (vec2 >> i); > + sum1 &= (vec1_i & 1) ? 
tmp : -1; > + if (vec2_i&1) sum2 &= tmp; > + } > + *ans++ &= sum1; n--; > + *ans++ &= sum2; n--; > + } > +} > diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c > index b165dc0c17f..7df1103ff89 100644 > --- a/gcc/tree-if-conv.c > +++ b/gcc/tree-if-conv.c > @@ -1732,7 +1732,9 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, > tree arg_0, tree arg_1, > reduction_op = gimple_assign_rhs_code (stmt); > } > > - if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR) > + if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR > + && reduction_op != BIT_IOR_EXPR && reduction_op != BIT_XOR_EXPR > + && reduction_op != BIT_AND_EXPR)
Please put each && on a separate line. > return false; > r_op1 = gimple_assign_rhs1 (stmt); > r_op2 = gimple_assign_rhs2 (stmt); > @@ -1742,7 +1744,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, > tree arg_0, tree arg_1, > > /* Make R_OP1 to hold reduction variable. */ > if (r_nop2 == PHI_RESULT (header_phi) > - && reduction_op == PLUS_EXPR) > + && commutative_tree_code (reduction_op)) > { > std::swap (r_op1, r_op2); > std::swap (r_nop1, r_nop2); > @@ -1811,7 +1813,10 @@ convert_scalar_cond_reduction (gimple *reduc, > gimple_stmt_iterator *gsi, > tree rhs1 = gimple_assign_rhs1 (reduc); > tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_"); > tree c; > - tree zero = build_zero_cst (TREE_TYPE (rhs1)); > + enum tree_code reduction_op = gimple_assign_rhs_code (reduc); > + tree op_nochange = reduction_op != BIT_AND_EXPR > + ? build_zero_cst (TREE_TYPE (rhs1)) > + : build_minus_one_cst (TREE_TYPE (rhs1)); Maybe export neutral_op_for_reduction and use it here (passing NULL for initial_value)? Otherwise looks OK. Thanks, Richard. > gimple_seq stmts = NULL; > > if (dump_file && (dump_flags & TDF_DETAILS)) > @@ -1824,14 +1829,14 @@ convert_scalar_cond_reduction (gimple *reduc, > gimple_stmt_iterator *gsi, > of reduction rhs. */ > c = fold_build_cond_expr (TREE_TYPE (rhs1), > unshare_expr (cond), > - swap ? zero : op1, > - swap ? op1 : zero); > + swap ? op_nochange : op1, > + swap ? op1 : op_nochange); > > /* Create assignment stmt and insert it at GSI. */ > new_assign = gimple_build_assign (tmp, c); > gsi_insert_before (gsi, new_assign, GSI_SAME_STMT); > - /* Build rhs for unconditional increment/decrement. */ > - rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc), > + /* Build rhs for unconditional increment/decrement/logic_operation. */ > + rhs = gimple_build (&stmts, reduction_op, > TREE_TYPE (rhs1), op0, tmp); > > if (has_nop) > -- > 2.18.1 >