On Wed, 16 Mar 2022 at 08:15, Richard Biener <richard.guent...@gmail.com> wrote:
> The canonical place to transform loops into builtins would be loop
> distribution.
> Another place would be final value replacement since you basically replace
> the reduction result with a call to the builtin, but I think
> loop-distribution is
> the better overall place.  See how we match strlen() there.
>
> Richard.

So I suppose that would be something along the lines of the patch below?
Except it'd need a lot more checks to make sure it's actually processing a
CRC computation, there needs to be code to actually expand the builtin using
optabs, and something needs to be done to make match.pd work on the output.

I'm seeing crcu16 being unrolled in the *.cunroll dump and the remains of the
loops deleted in the *.dse4 dump, but the match.pd clause to simplify the two
__builtin_crc8s calls never matches, so we end up with this in *.optimized:

ee_u16 crcu16 (ee_u16 newval, ee_u16 crc)
{
  unsigned char _1;
  short unsigned int _2;
  unsigned char _3;
  short unsigned int _24;
  short unsigned int _26;

  <bb 2> [local count: 1073741824]:
  _1 = (unsigned char) newval_4(D);
  _24 = __builtin_crc8s (crc_6(D), _1, 40961);
  _2 = newval_4(D) >> 8;
  _3 = (unsigned char) _2;
  _26 = __builtin_crc8s (_24, _3, 40961); [tail call]
  return _26;
}
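For reference, here's a rough standalone sketch (not the benchmark source;
crc8_step/crc16_step are just illustrative names) of the loop shape the
matcher below looks for, and of the equivalence a match.pd rule would have to
rely on to combine the two __builtin_crc8s calls above into a single 16-bit
step:

/* Rough sketch only -- not the benchmark source.  crc8_step is the
   8-iteration, LSB-first CRC step with XOR constant 0xa001 (40961) that the
   matcher below looks for, i.e. roughly what __builtin_crc8s (crc, data,
   0xa001) is supposed to compute.  crc16_step runs the same recurrence over
   all 16 bits of VAL; feeding the low byte and then the high byte through
   crc8_step gives the same result, which is the equivalence the match.pd
   clause would need in order to merge the two calls in the *.optimized dump
   above.  */

#include <assert.h>

static unsigned short
crc8_step (unsigned short crc, unsigned char data)
{
  for (int i = 0; i < 8; i++)
    {
      int bit = ((unsigned char) crc ^ data) & 1;
      data >>= 1;
      crc >>= 1;
      if (bit)
        crc ^= 0xa001;
    }
  return crc;
}

static unsigned short
crc16_step (unsigned short crc, unsigned short val)
{
  for (int i = 0; i < 16; i++)
    {
      int bit = (crc ^ val) & 1;
      val >>= 1;
      crc >>= 1;
      if (bit)
        crc ^= 0xa001;
    }
  return crc;
}

int
main (void)
{
  /* Exhaustive over the data value, sampled over the CRC seed.  */
  for (unsigned val = 0; val <= 0xffff; val++)
    for (unsigned crc = 0; crc <= 0xffff; crc += 0x1111)
      assert (crc16_step (crc, val)
              == crc8_step (crc8_step (crc, val & 0xff), val >> 8));
  return 0;
}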
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index db6e9096a86..b74e8569b94 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -659,6 +659,9 @@ class loop_distribution
      replace them accordingly.  */
   bool transform_reduction_loop (loop_p loop);
 
+  /* Transform some loops which calculate a CRC.  */
+  bool transform_reduction_loop (loop_p loop, tree niters);
+
   /* Compute topological order for basic blocks.  Topological order is
      needed because data dependence is computed for data references in
      lexicographical order.  */
@@ -3432,6 +3435,26 @@ generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var,
                               start_len);
 }
 
+static void
+generate_crc_builtin (loop_p loop, tree reduction_var,
+                      tree crc_in, tree data_in, tree xor_val,
+                      location_t loc)
+{
+  gimple_seq seq = NULL;
+  tree reduction_var_new = make_ssa_name (TREE_TYPE (reduction_var));
+
+  crc_in = force_gimple_operand (crc_in, &seq, true, NULL_TREE);
+  data_in = force_gimple_operand (data_in, &seq, true, NULL_TREE);
+  tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_CRC8S));
+  gimple *fn_call = gimple_build_call (fn, 3, crc_in, data_in, xor_val);
+  gimple_call_set_lhs (fn_call, reduction_var_new);
+  gimple_set_location (fn_call, loc);
+  gimple_seq_add_stmt (&seq, fn_call);
+
+  generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
+                                "generated crc%s", E_QImode);
+}
+
 /* Return true if we can count at least as many characters by taking pointer
    difference as we can count via reduction_var without an overflow.  Thus
    compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type),
@@ -3713,6 +3736,128 @@ loop_distribution::transform_reduction_loop (loop_p loop)
   return false;
 }
 
+/* Match loops like:
+
+  <bb 3> [local count: 954449105]:
+  # data_16 = PHI <data_13(7), data_9(D)(2)>
+  # crc_22 = PHI <crc_4(7), crc_10(D)(2)>
+  # i_3 = PHI <i_18(7), 0(2)>
+  # ivtmp_7 = PHI <ivtmp_6(7), 8(2)>
+  _8 = (unsigned char) crc_22;
+  _1 = _8 ^ data_16;
+  x16_12 = _1 & 1;
+  data_13 = data_16 >> 1;
+  _19 = crc_22 >> 1;
+  if (x16_12 != 0)
+    goto <bb 4>; [50.00%]
+  else
+    goto <bb 8>; [50.00%]
+
+  <bb 8> [local count: 477224553]:
+  goto <bb 5>; [100.00%]
+
+  <bb 4> [local count: 477224552]:
+  crc_17 = _19 ^ 40961;
+
+  <bb 5> [local count: 954449105]:
+  # crc_4 = PHI <crc_17(4), _19(8)>
+  i_18 = i_3 + 1;
+  ivtmp_6 = ivtmp_7 - 1;
+  if (ivtmp_6 != 0)
+    goto <bb 7>; [88.89%]
+  else
+    goto <bb 6>; [11.11%]
+
+  <bb 7> [local count: 848409806]:
+  goto <bb 3>; [100.00%]  */
+
+bool
+loop_distribution::transform_reduction_loop (loop_p loop, tree niters)
+{
+  gimple *reduction_stmt;
+
+  if (!wi::eq_p (wi::to_widest (niters), 7))
+    return false;
+
+  if (loop->num_nodes != 5)
+    return false;
+
+  reduction_stmt = determine_reduction_stmt (loop);
+  if (reduction_stmt == NULL)
+    return false;
+
+  /* Reduction variables are guaranteed to be SSA names.  */
+  tree reduction_var;
+  switch (gimple_code (reduction_stmt))
+    {
+    case GIMPLE_ASSIGN:
+    case GIMPLE_PHI:
+      reduction_var = gimple_get_lhs (reduction_stmt);
+      break;
+    default:
+      /* Bail out e.g. for GIMPLE_CALL.  */
+      return false;
+    }
+
+  if (EDGE_COUNT (loop->header->preds) != 2)
+    return false;
+
+  edge e, entry_edge = NULL, backedge = NULL;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, loop->header->preds)
+    if (e->src->loop_father != loop)
+      entry_edge = e;
+    else
+      backedge = e;
+
+  if (!entry_edge || !backedge)
+    return false;
+
+  tree crc_in = NULL_TREE, data_in = NULL_TREE;
+
+  for (gphi_iterator gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gphi *phi = gsi.phi ();
+      tree val = PHI_ARG_DEF_FROM_EDGE (phi, entry_edge);
+      if (PHI_ARG_DEF_FROM_EDGE (phi, backedge) == reduction_var)
+        crc_in = val;
+      else if (!data_in)
+        data_in = val;
+    }
+  if (!crc_in || !data_in)
+    return false;
+
+  basic_block xor_bb = NULL;
+  gcond *cond = safe_dyn_cast <gcond *> (last_stmt (loop->header));
+  if (!cond)
+    return false;
+
+  FOR_EACH_EDGE (e, ei, loop->header->succs)
+    if ((e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == NE_EXPR)
+        || (e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
+      xor_bb = e->dest;
+
+  if (!xor_bb)
+    return false;
+
+  gimple_stmt_iterator gsi = gsi_start_nondebug_bb (xor_bb);
+  if (gsi_end_p (gsi))
+    return false;
+  gimple *xor_stmt = gsi_stmt (gsi);
+  if (!is_gimple_assign (xor_stmt) || gimple_assign_rhs_code (xor_stmt) != BIT_XOR_EXPR)
+    return false;
+  tree xor_val = gimple_assign_rhs2 (xor_stmt);
+  gsi_next_nondebug (&gsi);
+  if (!gsi_end_p (gsi))
+    return false;
+
+  generate_crc_builtin (loop, reduction_var, crc_in, data_in, xor_val,
+                        gimple_location (xor_stmt));
+  return true;
+}
+
 /* Given innermost LOOP, return the outermost enclosing loop that forms a
    perfect loop nest.  */
 
@@ -3798,6 +3943,11 @@ loop_distribution::execute (function *fun)
           free_data_refs (datarefs_vec);
           continue;
         }
+      else if (TREE_CODE (niters) == INTEGER_CST && flag_tree_loop_distribute_patterns)
+        {
+          if (transform_reduction_loop (loop, niters))
+            continue;
+        }
 
       /* Get the perfect loop nest for distribution.  */
       loop = prepare_perfect_loop_nest (loop);