...well, for core instructions at least. NEON is a separate patch. I haven't measured any benefit or regression from this change on its own. It does make a difference in combination with the new auto-inc-dec pass, though.
I diffed the "before" and "after" assembly code for libav to get a sense for whether the patch was having the desired effect. It seems to be, though the circumstances that lead to the obvious cases can be a bit unfortunate. For example, A8 schedules like: ldr r5, [r2], #4 ldr r4, [sp, #128] cmp r4, r2 are now scheduled as: ldr r4, [sp, #128] ldr r5, [r2], #4 cmp r4, r2 as hoped. Tested on arm-linux-gnueabi. OK to install? Richard gcc/ * config/arm/arm-protos.h (arm_writeback_dep): Declare. (arm_writeback_only_dep): Likewise. * config/arm/arm.c (arm_writeback_dep): New function. (arm_writeback_only_dep_1, arm_writeback_only_dep): Likewise. * config/arm/cortex-a8.md: Add address-writeback bypasses for loads and stores. * config/arm/cortex-a9.md: Likewise. Index: gcc/config/arm/arm-protos.h =================================================================== --- gcc/config/arm/arm-protos.h 2011-08-17 16:03:05.000000000 +0100 +++ gcc/config/arm/arm-protos.h 2011-08-18 15:03:13.424542107 +0100 @@ -99,6 +99,8 @@ extern int arm_no_early_alu_shift_dep (r extern int arm_no_early_alu_shift_value_dep (rtx, rtx); extern int arm_no_early_mul_dep (rtx, rtx); extern int arm_mac_accumulator_is_mul_result (rtx, rtx); +extern int arm_writeback_dep (rtx, rtx); +extern int arm_writeback_only_dep (rtx, rtx); extern int tls_mentioned_p (rtx); extern int symbol_mentioned_p (rtx); Index: gcc/config/arm/arm.c =================================================================== --- gcc/config/arm/arm.c 2011-08-18 14:47:14.146516649 +0100 +++ gcc/config/arm/arm.c 2011-08-18 15:03:13.450542049 +0100 @@ -22689,6 +22689,50 @@ arm_mac_accumulator_is_mul_result (rtx p && !reg_overlap_mentioned_p (mul_result, mac_op1)); } +/* Return true if there is an address register writeback dependency + between PRODUCER and CONSUMER. 
*/ + +int +arm_writeback_dep (rtx producer, rtx consumer) +{ + rtx note; + + for (note = REG_NOTES (producer); note; note = XEXP (note, 1)) + if (REG_NOTE_KIND (note) == REG_INC + && reg_referenced_p (XEXP (note, 0), PATTERN (consumer))) + return true; + return false; +} + +/* A note_stores callback for which DATA is an rtx *. If DEST is set + by a SET pattern, and if *DATA is nonnull, check whether instruction + *DATA references DEST. Clear *DATA if so. */ + +static void +arm_writeback_only_dep_1 (rtx dest, const_rtx container, void *data) +{ + rtx *consumer; + + consumer = (rtx *) data; + if (GET_CODE (container) == SET + && *consumer + && reg_referenced_p (dest, PATTERN (*consumer))) + *consumer = 0; +} + +/* Return true if the only true dependence between PRODUCER and CONSUMER + is an address register writeback. */ + +int +arm_writeback_only_dep (rtx producer, rtx consumer) +{ + if (arm_writeback_dep (producer, consumer)) + { + note_stores (PATTERN (producer), arm_writeback_only_dep_1, &consumer); + return consumer != NULL_RTX; + } + return 0; +} /* The EABI says test the least significant bit of a guard variable. */ Index: gcc/config/arm/cortex-a8.md =================================================================== --- gcc/config/arm/cortex-a8.md 2011-08-12 08:51:44.400598496 +0100 +++ gcc/config/arm/cortex-a8.md 2011-08-18 15:03:35.516496568 +0100 @@ -186,8 +186,22 @@ (define_bypass 4 "cortex_a8_mul,cortex_a "cortex_a8_alu_shift_reg" "arm_no_early_alu_shift_value_dep") + +;; Load address register writeback + +;; Address register writeback has a latency of 2 instructions, or 1 if +;; there is no early dependency. Don't bother handling early shift +;; dependencies for address writeback; it's very unlikely that an +;; address will be used that way in critical code. 
+(define_bypass 1 "cortex_a8_load*" + "cortex_a8_alu*,cortex_a8_mov" + "arm_writeback_only_dep") +(define_bypass 2 "cortex_a8_load*" + "cortex_a8_*mul*,cortex_a8_*mla*,cortex_a8_load*, + cortex_a8_store*,cortex_a8_branch,cortex_a8_call" + "arm_writeback_only_dep") + ;; Load instructions. -;; The presence of any register writeback is ignored here. ;; A load result has latency 3 unless the dependent instruction has ;; no early dep, in which case it is only latency two. @@ -229,8 +243,18 @@ (define_bypass 4 "cortex_a8_load3_4" "cortex_a8_alu_shift_reg" "arm_no_early_alu_shift_value_dep") +;; Store address register writeback + +;; See comment for load address writeback above. +(define_bypass 1 "cortex_a8_store*" + "cortex_a8_alu*,cortex_a8_mov" + "arm_writeback_dep") +(define_bypass 2 "cortex_a8_store*" + "cortex_a8_*mul*,cortex_a8_*mla*,cortex_a8_load*, + cortex_a8_store*,cortex_a8_branch,cortex_a8_call" + "arm_writeback_dep") + ;; Store instructions. -;; Writeback is again ignored. (define_insn_reservation "cortex_a8_store1_2" 0 (and (eq_attr "tune" "cortexa8") Index: gcc/config/arm/cortex-a9.md =================================================================== --- gcc/config/arm/cortex-a9.md 2011-08-12 08:51:44.404598488 +0100 +++ gcc/config/arm/cortex-a9.md 2011-08-18 15:03:13.520541920 +0100 @@ -163,6 +163,15 @@ (define_bypass 2 "cortex_a9_dp_shift" cortex_a9_load1_2, cortex_a9_dp, cortex_a9_store1_2, cortex_a9_mult16, cortex_a9_mac16, cortex_a9_mac, cortex_a9_store3_4, cortex_a9_load3_4") +;; Address register writeback has a latency of 1 instruction. + +(define_bypass 1 "cortex_a9_load*" + "cortex_a9_*" + "arm_writeback_only_dep") +(define_bypass 1 "cortex_a9_store*" + "cortex_a9_*" + "arm_writeback_dep") + ;; An instruction in the load store pipeline can provide ;; read access to a DP instruction in the P0 default pipeline ;; before the writeback stage.