...well, for core instructions at least.  NEON is a separate patch.

I haven't measured any benefit or regression from this change on its own.
It makes a difference with the new auto-inc-dec pass though.
I diffed the "before" and "after" assembly code for libav to get a sense
of whether the patch was having the desired effect.  It seems to be,
though the circumstances that lead to the obvious cases can be a bit
unfortunate.  For example, Cortex-A8 schedules like:

        ldr     r5, [r2], #4
        ldr     r4, [sp, #128]
        cmp     r4, r2

are now scheduled as:

        ldr     r4, [sp, #128]
        ldr     r5, [r2], #4
        cmp     r4, r2

as hoped.  Tested on arm-linux-gnueabi.  OK to install?

Richard


gcc/
        * config/arm/arm-protos.h (arm_writeback_dep): Declare.
        (arm_writeback_only_dep): Likewise.
        * config/arm/arm.c (arm_writeback_dep): New function.
        (arm_writeback_only_dep_1, arm_writeback_only_dep): Likewise.
        * config/arm/cortex-a8.md: Add address-writeback bypasses for
        loads and stores.
        * config/arm/cortex-a9.md: Likewise.

Index: gcc/config/arm/arm-protos.h
===================================================================
--- gcc/config/arm/arm-protos.h 2011-08-17 16:03:05.000000000 +0100
+++ gcc/config/arm/arm-protos.h 2011-08-18 15:03:13.424542107 +0100
@@ -99,6 +99,8 @@ extern int arm_no_early_alu_shift_dep (r
 extern int arm_no_early_alu_shift_value_dep (rtx, rtx);
 extern int arm_no_early_mul_dep (rtx, rtx);
 extern int arm_mac_accumulator_is_mul_result (rtx, rtx);
+extern int arm_writeback_dep (rtx, rtx);
+extern int arm_writeback_only_dep (rtx, rtx);
 
 extern int tls_mentioned_p (rtx);
 extern int symbol_mentioned_p (rtx);
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c        2011-08-18 14:47:14.146516649 +0100
+++ gcc/config/arm/arm.c        2011-08-18 15:03:13.450542049 +0100
@@ -22689,6 +22689,50 @@ arm_mac_accumulator_is_mul_result (rtx p
           && !reg_overlap_mentioned_p (mul_result, mac_op1));
 }
 
+/* Return true if PRODUCER has a REG_INC note (an address register
+   writeback) for a register that the pattern of CONSUMER references.  */
+
+int
+arm_writeback_dep (rtx producer, rtx consumer)
+{
+  rtx note;
+
+  for (note = REG_NOTES (producer); note; note = XEXP (note, 1))
+    if (REG_NOTE_KIND (note) == REG_INC
+       && reg_referenced_p (XEXP (note, 0), PATTERN (consumer)))
+      return true;
+  return false;
+}
+
+/* A note_stores callback for which DATA is an rtx * holding an insn.
+   If CONTAINER is a SET of DEST and *DATA is still nonnull, clear *DATA
+   when the pattern of insn *DATA references DEST.  */
+
+static void
+arm_writeback_only_dep_1 (rtx dest, const_rtx container, void *data)
+{
+  rtx *consumer;
+
+  consumer = (rtx *) data;
+  if (GET_CODE (container) == SET
+      && *consumer
+      && reg_referenced_p (dest, PATTERN (*consumer)))
+    *consumer = 0;
+}
+
+/* Return nonzero if the only true dependence between PRODUCER and CONSUMER
+   is an address register writeback (no SET destination is referenced).  */
+
+int
+arm_writeback_only_dep (rtx producer, rtx consumer)
+{
+  if (arm_writeback_dep (producer, consumer))
+    {
+      note_stores (PATTERN (producer), arm_writeback_only_dep_1, &consumer);
+      return consumer != NULL_RTX;
+    }
+  return 0;
+}
 
 /* The EABI says test the least significant bit of a guard variable.  */
 
Index: gcc/config/arm/cortex-a8.md
===================================================================
--- gcc/config/arm/cortex-a8.md 2011-08-12 08:51:44.400598496 +0100
+++ gcc/config/arm/cortex-a8.md 2011-08-18 15:03:35.516496568 +0100
@@ -186,8 +186,22 @@ (define_bypass 4 "cortex_a8_mul,cortex_a
                "cortex_a8_alu_shift_reg"
                "arm_no_early_alu_shift_value_dep")
 
+
+;; Load address register writeback
+
+;; Address register writeback has a latency of 2 instructions, or 1 if
+;; there is no early dependency.  Don't bother handling early shift
+;; dependencies for address writeback; it's very unlikely that an
+;; address will be used that way in critical code.
+(define_bypass 1 "cortex_a8_load*"
+              "cortex_a8_alu*,cortex_a8_mov"
+              "arm_writeback_only_dep")
+(define_bypass 2 "cortex_a8_load*"
+              "cortex_a8_*mul*,cortex_a8_*mla*,cortex_a8_load*,
+               cortex_a8_store*,cortex_a8_branch,cortex_a8_call"
+              "arm_writeback_only_dep")
+
 ;; Load instructions.
-;; The presence of any register writeback is ignored here.
 
 ;; A load result has latency 3 unless the dependent instruction has
 ;; no early dep, in which case it is only latency two.
@@ -229,8 +243,18 @@ (define_bypass 4 "cortex_a8_load3_4"
                "cortex_a8_alu_shift_reg"
                "arm_no_early_alu_shift_value_dep")
 
+;; Store address register writeback
+
+;; See comment for load address writeback above.
+(define_bypass 1 "cortex_a8_store*"
+              "cortex_a8_alu*,cortex_a8_mov"
+              "arm_writeback_dep")
+(define_bypass 2 "cortex_a8_store*"
+              "cortex_a8_*mul*,cortex_a8_*mla*,cortex_a8_load*,
+               cortex_a8_store*,cortex_a8_branch,cortex_a8_call"
+              "arm_writeback_dep")
+
 ;; Store instructions.
-;; Writeback is again ignored.
 
 (define_insn_reservation "cortex_a8_store1_2" 0
   (and (eq_attr "tune" "cortexa8")
Index: gcc/config/arm/cortex-a9.md
===================================================================
--- gcc/config/arm/cortex-a9.md 2011-08-12 08:51:44.404598488 +0100
+++ gcc/config/arm/cortex-a9.md 2011-08-18 15:03:13.520541920 +0100
@@ -163,6 +163,15 @@ (define_bypass 2 "cortex_a9_dp_shift"
  cortex_a9_load1_2, cortex_a9_dp, cortex_a9_store1_2,
  cortex_a9_mult16, cortex_a9_mac16, cortex_a9_mac, cortex_a9_store3_4, cortex_a9_load3_4")
 
+;; Address register writeback has a latency of 1 instruction.
+
+(define_bypass 1 "cortex_a9_load*"
+                "cortex_a9_*"
+                "arm_writeback_only_dep")
+(define_bypass 1 "cortex_a9_store*"
+                "cortex_a9_*"
+                "arm_writeback_dep")
+
 ;; An instruction in the load store pipeline can provide
 ;; read access to a DP instruction in the P0 default pipeline
 ;; before the writeback stage.

Reply via email to