This patch adds aarch64_loop_unroll_adjust to limit RTL partial
unrolling based on the number of strided loads in the loop.
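
For reference, the heuristic boils down to: divide the number of available
hardware prefetch streams by the number of strided loads in the loop and
round the result down to a power of two.  Below is a minimal standalone
sketch of that computation (the helper names and the prefetcher count of 8
are made up for illustration; in the patch the real value comes from
aarch64_tune_params.prefetch->hw_prefetchers_avail):

#include <stdio.h>

/* Equivalent of GCC's floor_log2 for this sketch.  */
static unsigned
sketch_floor_log2 (unsigned n)
{
  unsigned log = 0;
  while (n >>= 1)
    log++;
  return log;
}

/* Shape of the adjustment: with COUNT strided loads and PREFETCHERS
   hardware prefetch streams available, the unroll factor becomes the
   largest power of two not exceeding PREFETCHERS / COUNT.  */
static unsigned
sketch_unroll_adjust (unsigned n_unroll, unsigned count, int prefetchers)
{
  if (prefetchers == -1 || count == 0)
    return n_unroll;
  return 1u << sketch_floor_log2 ((unsigned) prefetchers / count);
}

int
main (void)
{
  /* With 3 strided loads and 8 prefetch streams the factor drops to 2,
     regardless of the initial request of 16.  */
  printf ("%u\n", sketch_unroll_adjust (16, 3, 8));
  return 0;
}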

Thanks,
Kugan

gcc/ChangeLog:

2017-09-12  Kugan Vivekanandarajah  <kug...@linaro.org>

    * cfgloop.h (iv_analyze_biv): Declare.
    * loop-iv.c (iv_analyze_biv): No longer static.
    * config/aarch64/aarch64.c (strided_load_p): New.
    (insn_has_strided_load): New.
    (count_strided_load_rtl): New.
    (aarch64_loop_unroll_adjust): New.
    (TARGET_LOOP_UNROLL_ADJUST): Define.
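
For context, strided_load_p treats a load as strided when its address is,
or is built from, a basic induction variable, so each such load tends to
occupy its own hardware prefetch stream.  A made-up C loop showing the
kind of access pattern being counted (illustration only, not part of the
patch):

/* Both loads (x[i] and y[i]) walk memory with a constant stride: their
   addresses are simple functions of the induction variable I, so each
   typically claims a hardware prefetch stream.  */
void
saxpy (float *restrict y, const float *restrict x, float a, int n)
{
  for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];
}
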
From 10e02b026784798fff6a3513dc11b1cffb1cf78a Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org>
Date: Wed, 23 Aug 2017 12:35:14 +1000
Subject: [PATCH 5/5] add aarch64_loop_unroll_adjust

---
 gcc/cfgloop.h                |   1 +
 gcc/config/aarch64/aarch64.c | 136 +++++++++++++++++++++++++++++++++++++++++++
 gcc/loop-iv.c                |   2 +-
 3 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 2308e7a..a3876a2 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -479,6 +479,7 @@ extern bool iv_analyze_expr (rtx_insn *, rtx, machine_mode,
 extern rtx get_iv_value (struct rtx_iv *, rtx);
 extern bool biv_p (rtx_insn *, rtx);
 extern void find_simple_exit (struct loop *, struct niter_desc *);
+extern bool iv_analyze_biv (rtx def, struct rtx_iv *iv);
 extern void iv_analysis_done (void);
 
 extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e88bb6c..624a996 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15189,6 +15189,139 @@ aarch64_ok_to_unroll (struct loop *loop, unsigned HOST_WIDE_INT nunroll)
   return true;
 }
 
+/* Return true if X is a strided load (a MEM with a biv-based address).  */
+
+static bool
+strided_load_p (const_rtx x)
+{
+  struct rtx_iv iv;
+  rtx reg;
+
+  if (!MEM_P (x))
+    return false;
+
+  reg = XEXP (x, 0);
+  if (REG_P (reg)
+      || UNARY_P (reg))
+    {
+      if (!REG_P (reg))
+	reg = XEXP (reg, 0);
+      if (REG_P (reg)
+	  && iv_analyze_biv (reg, &iv))
+	return true;
+    }
+  else if (BINARY_P (reg))
+    {
+      rtx reg1, reg2;
+      reg1 = XEXP (reg, 0);
+      reg2 = XEXP (reg, 1);
+      if (REG_P (reg1)
+	  && iv_analyze_biv (reg1, &iv))
+	return true;
+      if (REG_P (reg2)
+	  && iv_analyze_biv (reg2, &iv))
+	return true;
+    }
+  return false;
+}
+
+
+/* Return true if INSN contains a strided load.  */
+
+static bool
+insn_has_strided_load (rtx_insn *insn)
+{
+  subrtx_iterator::array_type array;
+  if (!INSN_P (insn) || recog_memoized (insn) < 0)
+    return false;
+  rtx pat = PATTERN (insn);
+
+  switch (GET_CODE (pat))
+    {
+    case PARALLEL:
+	{
+	  for (int j = 0; j < XVECLEN (pat, 0); ++j)
+	    {
+	      rtx ex = XVECEXP (pat, 0, j);
+	      FOR_EACH_SUBRTX (iter, array, ex, NONCONST)
+		{
+		  const_rtx x = *iter;
+		  if (GET_CODE (x) == SET
+		      && strided_load_p (SET_SRC (x)))
+		    return true;
+		}
+	    }
+	}
+      break;
+
+    case SET:
+      FOR_EACH_SUBRTX (iter, array, SET_SRC (pat), NONCONST)
+	{
+	  const_rtx x = *iter;
+	  if (strided_load_p (x))
+	    return true;
+	}
+
+    default:
+      break;
+    }
+  return false;
+}
+
+/* Count the strided loads in LOOP.  If the count grows large relative
+   to MAX_STRIDED_LOADS, we do not need an exact number, so stop early.
+   The result is used to limit the partial unrolling factor to avoid
+   competing for hardware prefetch streams.  */
+
+static unsigned
+count_strided_load_rtl (struct loop *loop, unsigned max_strided_loads)
+{
+  basic_block *bbs;
+  unsigned count = 0;
+  rtx_insn *insn;
+  iv_analysis_loop_init (loop);
+  bbs = get_loop_body (loop);
+
+  for (unsigned i = 0; i < loop->num_nodes; ++i)
+    {
+      FOR_BB_INSNS (bbs[i], insn)
+	{
+	  if (insn_has_strided_load (insn))
+	    count++;
+
+	  if (count > (max_strided_loads / 2))
+	    {
+	      free (bbs);
+	      iv_analysis_done ();
+	      return count;
+	    }
+	}
+    }
+  free (bbs);
+  iv_analysis_done ();
+  return count;
+}
+
+/* Implement TARGET_LOOP_UNROLL_ADJUST.  Limit the partial unrolling
+   factor if unrolling would give the loop more prefetch streams than
+   the hardware can handle.  */
+
+static unsigned
+aarch64_loop_unroll_adjust (unsigned n_unroll, struct loop *loop)
+{
+  int max_strided_loads;
+  max_strided_loads = aarch64_tune_params.prefetch->hw_prefetchers_avail;
+
+  if (max_strided_loads == -1)
+    return n_unroll;
+
+  unsigned count = count_strided_load_rtl (loop, max_strided_loads);
+  if (count > 0)
+    n_unroll = 1 << (floor_log2 (max_strided_loads / count));
+
+  return n_unroll;
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -15620,6 +15753,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_OK_TO_UNROLL
 #define TARGET_OK_TO_UNROLL aarch64_ok_to_unroll
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST aarch64_loop_unroll_adjust
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/loop-iv.c b/gcc/loop-iv.c
index 745b613..3a8c54e 100644
--- a/gcc/loop-iv.c
+++ b/gcc/loop-iv.c
@@ -852,7 +852,7 @@ record_biv (rtx def, struct rtx_iv *iv)
 /* Determines whether DEF is a biv and if so, stores its description
    to *IV.  */
 
-static bool
+bool
 iv_analyze_biv (rtx def, struct rtx_iv *iv)
 {
   rtx inner_step, outer_step;
-- 
2.7.4
