This patch adds an aarch64_loop_unroll_adjust target hook implementation that limits RTL partial unrolling based on the number of strided loads in the loop, so that the unrolled loop does not require more prefetch streams than the hardware provides.
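To illustrate what the detection in the patch below is aimed at, consider a loop of the following shape. This is only a hypothetical example (the function name is made up, and whether a given access is recognised depends on the RTL addressing form after earlier optimisations):

void
scale_add (float *restrict a, const float *restrict b,
           const float *restrict c, float k, int n)
{
  /* The loads from b[i] and c[i] each use an address that advances by a
     fixed stride per iteration (a basic induction variable at the RTL
     level), so each would typically be counted as one strided load.
     The store to a[i] is not a load and is not counted, since only
     SET_SRC operands are inspected.  */
  for (int i = 0; i < n; i++)
    a[i] = b[i] * k + c[i];
}

With two strided loads and, say, eight hardware prefetch streams, the new hook would cap the unroll factor at four (see the arithmetic sketch after the ChangeLog).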
Thanks,
Kugan

gcc/ChangeLog:

2017-09-12  Kugan Vivekanandarajah  <kug...@linaro.org>

        * cfgloop.h (iv_analyze_biv): Declare.
        * loop-iv.c (iv_analyze_biv): Export.
        * config/aarch64/aarch64.c (strided_load_p): New.
        (insn_has_strided_load): New.
        (count_strided_load_rtl): New.
        (aarch64_loop_unroll_adjust): New.
        (TARGET_LOOP_UNROLL_ADJUST): Define.
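To make the adjustment arithmetic concrete before the patch itself, here is a small standalone sketch. It is not GCC code: floor_log2_u and adjusted_unroll are illustrative stand-ins for GCC's floor_log2 and the new hook, and the numbers in main are invented. With 8 hardware prefetch streams and 3 strided loads in the loop body, the unroll factor becomes 1 << floor_log2 (8 / 3) = 2:

#include <stdio.h>

/* Illustrative stand-in for GCC's floor_log2 on positive values.  */
static unsigned
floor_log2_u (unsigned x)
{
  unsigned r = 0;
  while (x >>= 1)
    r++;
  return r;
}

/* Sketch of the clamping done by aarch64_loop_unroll_adjust; the q == 0
   guard is an extra safety net for this standalone sketch.  */
static unsigned
adjusted_unroll (unsigned n_unroll, unsigned hw_prefetchers,
                 unsigned strided_loads)
{
  if (strided_loads == 0)
    return n_unroll;
  unsigned q = hw_prefetchers / strided_loads;
  if (q == 0)
    return 1;
  return 1u << floor_log2_u (q);
}

int
main (void)
{
  printf ("%u\n", adjusted_unroll (8, 8, 3));   /* prints 2 */
  return 0;
}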
From 10e02b026784798fff6a3513dc11b1cffb1cf78a Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org>
Date: Wed, 23 Aug 2017 12:35:14 +1000
Subject: [PATCH 5/5] add aarch64_loop_unroll_adjust

---
 gcc/cfgloop.h                |   1 +
 gcc/config/aarch64/aarch64.c | 136 +++++++++++++++++++++++++++++++++++++++++++
 gcc/loop-iv.c                |   2 +-
 3 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 2308e7a..a3876a2 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -479,6 +479,7 @@ extern bool iv_analyze_expr (rtx_insn *, rtx, machine_mode,
 extern rtx get_iv_value (struct rtx_iv *, rtx);
 extern bool biv_p (rtx_insn *, rtx);
 extern void find_simple_exit (struct loop *, struct niter_desc *);
+extern bool iv_analyze_biv (rtx def, struct rtx_iv *iv);
 extern void iv_analysis_done (void);
 extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e88bb6c..624a996 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15189,6 +15189,139 @@ aarch64_ok_to_unroll (struct loop *loop, unsigned HOST_WIDE_INT nunroll)
   return true;
 }
 
+/* Return true if X is a MEM whose address involves a biv (a strided load).  */
+
+static bool
+strided_load_p (const_rtx x)
+{
+  struct rtx_iv iv;
+  rtx reg;
+
+  if (!MEM_P (x))
+    return false;
+
+  reg = XEXP (x, 0);
+  if (REG_P (reg)
+      || UNARY_P (reg))
+    {
+      if (!REG_P (reg))
+        reg = XEXP (reg, 0);
+      if (REG_P (reg)
+          && iv_analyze_biv (reg, &iv))
+        return true;
+    }
+  else if (BINARY_P (reg))
+    {
+      rtx reg1, reg2;
+      reg1 = XEXP (reg, 0);
+      reg2 = XEXP (reg, 1);
+      if (REG_P (reg1)
+          && iv_analyze_biv (reg1, &iv))
+        return true;
+      if (REG_P (reg2)
+          && iv_analyze_biv (reg2, &iv))
+        return true;
+    }
+  return false;
+}
+
+
+/* Return true if INSN contains a strided load.  */
+
+static bool
+insn_has_strided_load (rtx_insn *insn)
+{
+  subrtx_iterator::array_type array;
+  if (!INSN_P (insn) || recog_memoized (insn) < 0)
+    return false;
+  rtx pat = PATTERN (insn);
+
+  switch (GET_CODE (pat))
+    {
+    case PARALLEL:
+      {
+        for (int j = 0; j < XVECLEN (pat, 0); ++j)
+          {
+            rtx ex = XVECEXP (pat, 0, j);
+            FOR_EACH_SUBRTX (iter, array, ex, NONCONST)
+              {
+                const_rtx x = *iter;
+                if (GET_CODE (x) == SET
+                    && strided_load_p (SET_SRC (x)))
+                  return true;
+              }
+          }
+      }
+      break;
+
+    case SET:
+      FOR_EACH_SUBRTX (iter, array, SET_SRC (pat), NONCONST)
+        {
+          const_rtx x = *iter;
+          if (strided_load_p (x))
+            return true;
+        }
+
+    default:
+      break;
+    }
+  return false;
+}
+
+/* Count the strided loads in LOOP.  If the number of strided loads
+   exceeds MAX_STRIDED_LOADS, we do not need to count them all; the
+   result is only used to limit the partial unrolling factor so the
+   hardware prefetcher is not oversubscribed.  */
+
+static unsigned
+count_strided_load_rtl (struct loop *loop, unsigned max_strided_loads)
+{
+  basic_block *bbs;
+  unsigned count = 0;
+  rtx_insn *insn;
+  iv_analysis_loop_init (loop);
+  bbs = get_loop_body (loop);
+
+  for (unsigned i = 0; i < loop->num_nodes; ++i)
+    {
+      FOR_BB_INSNS (bbs[i], insn)
+        {
+          if (insn_has_strided_load (insn))
+            count++;
+
+          if (count > (max_strided_loads / 2))
+            {
+              free (bbs);
+              iv_analysis_done ();
+              return count;
+            }
+        }
+    }
+  free (bbs);
+  iv_analysis_done ();
+  return count;
+}
+
+/* Implement TARGET_LOOP_UNROLL_ADJUST.  Limit the partial unrolling
+   factor if unrolling would create more prefetch streams than the
+   hardware can handle.  */
+
+static unsigned
+aarch64_loop_unroll_adjust (unsigned n_unroll, struct loop *loop)
+{
+  int max_strided_loads;
+  max_strided_loads = aarch64_tune_params.prefetch->hw_prefetchers_avail;
+
+  if (max_strided_loads == -1)
+    return n_unroll;
+
+  unsigned count = count_strided_load_rtl (loop, max_strided_loads);
+  if (count > 0)
+    n_unroll = 1 << (floor_log2 (max_strided_loads / count));
+
+  return n_unroll;
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -15620,6 +15753,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_OK_TO_UNROLL
 #define TARGET_OK_TO_UNROLL aarch64_ok_to_unroll
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST aarch64_loop_unroll_adjust
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/loop-iv.c b/gcc/loop-iv.c
index 745b613..3a8c54e 100644
--- a/gcc/loop-iv.c
+++ b/gcc/loop-iv.c
@@ -852,7 +852,7 @@ record_biv (rtx def, struct rtx_iv *iv)
 /* Determines whether DEF is a biv and if so, stores its description
    to *IV.  */
 
-static bool
+bool
 iv_analyze_biv (rtx def, struct rtx_iv *iv)
 {
   rtx inner_step, outer_step;
-- 
2.7.4