This is an addendum to the -msplit-ldst patch.
When -msplit-ldst is on, it may be possible to propagate __zero_reg__
to the sources of the new stores. For example, without this patch,
unsigned long lx;
void store_lsr17 (void)
{
lx >>= 17;
}
compiles to:
store_lsr17:
lds r26,lx+2 ; movqi_insn
lds r27,lx+3 ; movqi_insn
movw r24,r26 ; *movhi
lsr r25 ; *lshrhi3_const
ror r24
ldi r26,0 ; movqi_insn
ldi r27,0 ; movqi_insn
sts lx,r24 ; movqi_insn
sts lx+1,r25 ; movqi_insn
sts lx+2,r26 ; movqi_insn
sts lx+3,r27 ; movqi_insn
ret
but with this patch it becomes:
store_lsr17:
lds r26,lx+2 ; movqi_insn
lds r27,lx+3 ; movqi_insn
movw r24,r26 ; *movhi
lsr r25 ; *lshrhi3_const
ror r24
sts lx,r24 ; movqi_insn
sts lx+1,r25 ; movqi_insn
sts lx+2,__zero_reg__ ; movqi_insn
sts lx+3,__zero_reg__ ; movqi_insn
ret
Ok for trunk?
Johann
--
AVR: target/107957 - Propagate zero_reg to store sources.
When -msplit-ldst is on, it may be possible to propagate __zero_reg__
to the sources of the new stores. For example, without this patch,
unsigned long lx;
void store_lsr17 (void)
{
lx >>= 17;
}
compiles to:
store_lsr17:
lds r26,lx+2 ; movqi_insn
lds r27,lx+3 ; movqi_insn
movw r24,r26 ; *movhi
lsr r25 ; *lshrhi3_const
ror r24
ldi r26,0 ; movqi_insn
ldi r27,0 ; movqi_insn
sts lx,r24 ; movqi_insn
sts lx+1,r25 ; movqi_insn
sts lx+2,r26 ; movqi_insn
sts lx+3,r27 ; movqi_insn
ret
but with this patch it becomes:
store_lsr17:
lds r26,lx+2 ; movqi_insn
lds r27,lx+3 ; movqi_insn
movw r24,r26 ; *movhi
lsr r25 ; *lshrhi3_const
ror r24
sts lx,r24 ; movqi_insn
sts lx+1,r25 ; movqi_insn
sts lx+2,__zero_reg__ ; movqi_insn
sts lx+3,__zero_reg__ ; movqi_insn
ret
gcc/
PR target/107957
* config/avr/avr-passes-fuse-move.h (bbinfo_t) <try_mem0_p>:
Add static property.
* config/avr/avr-passes.cc (bbinfo_t::try_mem0_p): Define it.
(optimize_data_t::try_mem0): New method.
(bbinfo_t::optimize_one_block) [bbinfo_t::try_mem0_p]: Run try_mem0.
(bbinfo_t::optimize_one_function): Set bbinfo_t::try_mem0_p.
* config/avr/avr.md (pushhi1_insn): Also allow zero as source.
(define_split) [avropt_split_ldst]: Only run avr_split_ldst()
when avr-fuse-add has been run at least once.
* doc/invoke.texi (AVR Options) <-msplit-ldst>: Document it.
diff --git a/gcc/config/avr/avr-passes-fuse-move.h b/gcc/config/avr/avr-passes-fuse-move.h
index dbed1a636f3..432f9ca4670 100644
--- a/gcc/config/avr/avr-passes-fuse-move.h
+++ b/gcc/config/avr/avr-passes-fuse-move.h
@@ -1172,6 +1172,7 @@ struct bbinfo_t
static find_plies_data_t *fpd;
static bool try_fuse_p;
+ static bool try_mem0_p;
static bool try_bin_arg1_p;
static bool try_simplify_p;
static bool try_split_ldi_p;
diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc
index de8de1cd2e8..fad64b1b345 100644
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -434,6 +434,11 @@ static machine_mode size_to_mode (int size)
Split all insns where the operation can be performed on individual
bytes, like andsi3. In example (4) the andhi3 can be optimized
to an andqi3.
+
+ bbinfo_t::try_mem0_p
+ Try to fuse a mem = reg insn to mem = __zero_reg__.
+ This should only occur when -msplit-ldst is on, but may
+ also occur with pushes since push<mode>1 splits them.
*/
@@ -514,6 +519,7 @@ bool bbinfo_t::try_split_any_p;
bool bbinfo_t::try_simplify_p;
bool bbinfo_t::use_arith_p;
bool bbinfo_t::use_set_some_p;
+bool bbinfo_t::try_mem0_p;
// Abstract Interpretation of expressions.
@@ -1087,6 +1093,7 @@ struct optimize_data_t
{}
bool try_fuse (bbinfo_t *);
+ bool try_mem0 (bbinfo_t *);
bool try_bin_arg1 (bbinfo_t *);
bool try_simplify (bbinfo_t *);
bool try_split_ldi (bbinfo_t *);
@@ -2509,6 +2516,44 @@ bbinfo_t::run_find_plies (const insninfo_t &ii, const memento_t &memo) const
}
+// Try to propagate __zero_reg__ to a mem = reg insn's source.
+// Returns true on success and sets .n_new_insns.
+bool
+optimize_data_t::try_mem0 (bbinfo_t *)
+{
+ rtx_insn *insn = curr.ii.m_insn;
+ rtx set, mem, reg;
+ machine_mode mode;
+
+ if (insn
+ && (set = single_set (insn))
+ && MEM_P (mem = SET_DEST (set))
+ && REG_P (reg = SET_SRC (set))
+ && GET_MODE_SIZE (mode = GET_MODE (mem)) <= 4
+ && END_REGNO (reg) <= REG_32
+ && ! (regmask (reg) & memento_t::fixed_regs_mask)
+ && curr.regs.have_value (REGNO (reg), GET_MODE_SIZE (mode), 0x0))
+ {
+ avr_dump (";; Found insn %d: mem:%m = 0 = r%d\n", INSN_UID (insn),
+ mode, REGNO (reg));
+
+ // Some insns like PUSHes don't clobber REG_CC.
+ bool clobbers_cc = GET_CODE (PATTERN (insn)) == PARALLEL;
+
+ if (clobbers_cc)
+ emit_valid_move_clobbercc (mem, CONST0_RTX (mode));
+ else
+ emit_valid_insn (gen_rtx_SET (mem, CONST0_RTX (mode)));
+
+ n_new_insns = 1;
+
+ return true;
+ }
+
+ return false;
+}
+
+
// Try to fuse two 1-byte insns .prev and .curr to one 2-byte insn (MOVW).
// Returns true on success, and sets .n_new_insns, .ignore_mask etc.
bool
@@ -3108,7 +3153,8 @@ bbinfo_t::optimize_one_block (bool &changed)
|| (bbinfo_t::try_bin_arg1_p && od.try_bin_arg1 (this))
|| (bbinfo_t::try_simplify_p && od.try_simplify (this))
|| (bbinfo_t::try_split_ldi_p && od.try_split_ldi (this))
- || (bbinfo_t::try_split_any_p && od.try_split_any (this)));
+ || (bbinfo_t::try_split_any_p && od.try_split_any (this))
+ || (bbinfo_t::try_mem0_p && od.try_mem0 (this)));
rtx_insn *new_insns = get_insns ();
end_sequence ();
@@ -3193,6 +3239,7 @@ bbinfo_t::optimize_one_function (function *func)
// Which optimization(s) to perform.
bbinfo_t::try_fuse_p = avropt_fuse_move & 0x1; // Digit 0 in [0, 1].
+ bbinfo_t::try_mem0_p = avropt_fuse_move & 0x1; // Digit 0 in [0, 1].
bbinfo_t::try_bin_arg1_p = avropt_fuse_move & 0x2; // Digit 1 in [0, 1].
bbinfo_t::try_split_any_p = avropt_fuse_move & 0x4; // Digit 2 in [0, 1].
bbinfo_t::try_split_ldi_p = avropt_fuse_move >> 3; // Digit 3 in [0, 2].
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index e475c37f10d..dd3a44bb72f 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -435,9 +435,11 @@ (define_insn "push<mode>1"
(define_insn "pushhi1_insn"
[(set (mem:HI (post_dec:HI (reg:HI REG_SP)))
- (match_operand:HI 0 "register_operand" "r"))]
+ (match_operand:HI 0 "reg_or_0_operand" "r,Y00"))]
""
- "push %B0\;push %A0"
+ "@
+ push %B0\;push %A0
+ push __zero_reg__\;push __zero_reg__"
[(set_attr "length" "2")])
;; All modes for a multi-byte push. We must include complex modes here too,
@@ -1014,6 +1016,9 @@ (define_split
// provided non-volatile, addr-space = generic, no reg-overlap
// and the resulting addressings are natively supported.
if (avropt_split_ldst
+ // Splitting too early may obfuscate some PRE_DEC / POST_INC
+ // opportunities, thus only split after avr-fuse-add.
+ && n_avr_fuse_add_executed > 0
&& GET_MODE_SIZE (<MODE>mode) > 1
&& avr_split_ldst (operands))
DONE;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e3c2adc2507..ad521b70ccb 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -904,9 +904,9 @@ Objective-C and Objective-C++ Dialects}.
-mbranch-cost=@var{cost} -mfuse-add=@var{level} -mfuse-move=@var{level}
-mcall-prologues -mgas-isr-prologues -mint8 -mflmap
-mdouble=@var{bits} -mlong-double=@var{bits}
--mn_flash=@var{size} -mno-interrupts
+-mn_flash=@var{size} -mfract-convert-truncate -mno-interrupts
-mmain-is-OS_task -mrelax -mrmw -mstrict-X -mtiny-stack
--mrodata-in-ram -mfract-convert-truncate -msplit-bit-shift
+-mrodata-in-ram -msplit-bit-shift -msplit-ldst
-mshort-calls -mskip-bug -nodevicelib -nodevicespecs
-Waddr-space-convert -Wmisspelled-isr}
@@ -24359,6 +24359,11 @@ This optimization is turned on per default for @option{-O2} and higher,
including @option{-Os} but excluding @option{-Oz}.
Splitting of shifts with a constant offset that is
a multiple of 8 is controlled by @option{-mfuse-move}.
+@opindex msplit-ldst
+
+@item -msplit-ldst
+Split multi-byte loads and stores into several byte loads and stores.
+This optimization is turned on per default for @option{-O2} and higher.
@opindex mtiny-stack
@item -mtiny-stack