This patch prevents the tree unroller from completely unrolling inner loops if that would result in an excessive number of strided loads in the outer loop.
Thanks, Kugan gcc/ChangeLog: 2017-09-12 Kugan Vivekanandarajah <kug...@linaro.org> * config/aarch64/aarch64.c (count_mem_load_streams): New. (aarch64_ok_to_unroll): New. * doc/tm.texi (ok_to_unroll): Define new target hook. * doc/tm.texi.in (ok_to_unroll): Likewise. * target.def (ok_to_unroll): Likewise. * tree-ssa-loop-ivcanon.c (try_unroll_loop_completely): Use ok_to_unroll while unrolling.
From 5de245bbf6ba1768e8206a61feb0f42c106a1d94 Mon Sep 17 00:00:00 2001 From: Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> Date: Fri, 18 Aug 2017 16:41:13 +1000 Subject: [PATCH 3/5] tree unroller limit strided loads --- gcc/config/aarch64/aarch64.c | 70 ++++++++++++++++++++++++++++++++++++++++++++ gcc/doc/tm.texi | 4 +++ gcc/doc/tm.texi.in | 2 ++ gcc/target.def | 8 +++++ gcc/tree-ssa-loop-ivcanon.c | 8 +++++ 5 files changed, 92 insertions(+) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 7d1ee70..e88bb6c 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -64,6 +64,7 @@ #include "sched-int.h" #include "target-globals.h" #include "common/common-target.h" +#include "tree-scalar-evolution.h" #include "selftest.h" #include "selftest-rtl.h" @@ -15122,6 +15123,72 @@ aarch64_sched_can_speculate_insn (rtx_insn *insn) } } +/* Count the strided loads in the LOOP with respect to OUT_LOOP. + If the strided loads are larger (compared to MAX_STRIDED_LOADS), + we dont need to compute all of them. 
*/ + +static unsigned +count_mem_load_streams (struct loop *out_loop, + struct loop *loop, + unsigned max_strided_loads) +{ + basic_block *bbs = get_loop_body (loop); + unsigned nbbs = loop->num_nodes; + gimple_stmt_iterator gsi; + unsigned count = 0; + + for (unsigned i = 0; i < nbbs; i++) + { + bool ok; + basic_block bb = bbs[i]; + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (!is_gimple_assign (stmt) + || !gimple_vuse (stmt)) + continue; + tree op = gimple_assign_rhs1 (stmt); + if (!INDIRECT_REF_P (op) + && TREE_CODE (op) != MEM_REF + && TREE_CODE (op) != TARGET_MEM_REF) + continue; + op = TREE_OPERAND (op, 0); + tree ev = analyze_scalar_evolution (out_loop, op); + ev = instantiate_parameters (loop, ev); + if (no_evolution_in_loop_p (ev, out_loop->num, &ok) && !ok) + count++; + if (count >= max_strided_loads) + return count; + } + } + return count; +} + +/* Target hook that prevents complete loop unrolling if this would make + the outer loop's prefetch strems more than hardware can handle. */ + +static bool +aarch64_ok_to_unroll (struct loop *loop, unsigned HOST_WIDE_INT nunroll) +{ + struct loop *loop_father; + unsigned loads; + unsigned outter_loads; + + if (aarch64_tune_params.prefetch->hw_prefetchers_avail == -1) + return true; + + if ((loop_father = loop_outer (loop))) + { + unsigned max_strided_loads = aarch64_tune_params.prefetch->hw_prefetchers_avail; + loads = count_mem_load_streams (loop_father, loop, max_strided_loads); + outter_loads = count_mem_load_streams (loop_father, loop_father, max_strided_loads); + if ((outter_loads + (nunroll - 1) * loads) > max_strided_loads) + return false; + } + return true; +} + /* Target-specific selftests. 
*/ #if CHECKING_P @@ -15550,6 +15617,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4 +#undef TARGET_OK_TO_UNROLL +#define TARGET_OK_TO_UNROLL aarch64_ok_to_unroll + #if CHECKING_P #undef TARGET_RUN_TARGET_SELFTESTS #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 795e492..45cea4c 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -11617,6 +11617,10 @@ is required only when the target has special constraints like maximum number of memory accesses. @end deftypefn +@deftypefn {Target Hook} bool TARGET_OK_TO_UNROLL (struct loop *@var{loop_info}, unsigned HOST_WIDE_INT @var{nunroll}) +This hook should return false if target prefers loop should not be unrolled +@end deftypefn + @defmac POWI_MAX_MULTS If defined, this macro is interpreted as a signed integer C expression that specifies the maximum number of floating point multiplications diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 98f2e6b..64dfa51 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -8155,6 +8155,8 @@ build_type_attribute_variant (@var{mdecl}, @hook TARGET_LOOP_UNROLL_ADJUST +@hook TARGET_OK_TO_UNROLL + @defmac POWI_MAX_MULTS If defined, this macro is interpreted as a signed integer C expression that specifies the maximum number of floating point multiplications diff --git a/gcc/target.def b/gcc/target.def index bbd9c01..2f62328 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -5120,6 +5120,14 @@ hardware divmod insn but defines target-specific divmod libfuncs.", void, (rtx libfunc, machine_mode mode, rtx op0, rtx op1, rtx *quot, rtx *rem), NULL) +/* Target function to check complete unrolling of loop is profitable for loop. 
*/ +DEFHOOK +(ok_to_unroll, + "This hook should return false if target prefers loop should not be unrolled", + bool, + (struct loop *loop_info, unsigned HOST_WIDE_INT nunroll), + NULL) + /* Return the class for a secondary reload, and fill in extra information. */ DEFHOOK (secondary_reload, diff --git a/gcc/tree-ssa-loop-ivcanon.c b/gcc/tree-ssa-loop-ivcanon.c index efb199a..c2016458 100644 --- a/gcc/tree-ssa-loop-ivcanon.c +++ b/gcc/tree-ssa-loop-ivcanon.c @@ -63,6 +63,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-inline.h" #include "tree-cfgcleanup.h" #include "builtins.h" +#include "target.h" /* Specifies types of loops that may be unrolled. */ @@ -855,6 +856,13 @@ try_unroll_loop_completely (struct loop *loop, loop->num); return false; } + + if (targetm.ok_to_unroll + && !targetm.ok_to_unroll (loop, n_unroll)) + { + return false; + } + if (!n_unroll) dump_printf_loc (report_flags, locus, "loop turned into non-loop; it never loops.\n"); -- 2.7.4