This is an addendum to the -msplit-ldst patch.

When -msplit-ldst is on, it may be possible to propagate __zero_reg__
to the sources of the new stores.  For example, without this patch,

unsigned long lx;

void store_lsr17 (void)
{
   lx >>= 17;
}

compiles to:

store_lsr17:
   lds r26,lx+2           ;  movqi_insn
   lds r27,lx+3           ;  movqi_insn
   movw r24,r26           ;  *movhi
   lsr r25                ;  *lshrhi3_const
   ror r24
   ldi r26,0              ;  movqi_insn
   ldi r27,0              ;  movqi_insn
   sts lx,r24             ;  movqi_insn
   sts lx+1,r25           ;  movqi_insn
   sts lx+2,r26           ;  movqi_insn
   sts lx+3,r27           ;  movqi_insn
   ret

but with this patch it becomes:

store_lsr17:
   lds r26,lx+2           ;  movqi_insn
   lds r27,lx+3           ;  movqi_insn
   movw r24,r26           ;  *movhi
   lsr r25                ;  *lshrhi3_const
   ror r24
   sts lx,r24             ;  movqi_insn
   sts lx+1,r25           ;  movqi_insn
   sts lx+2,__zero_reg__  ;  movqi_insn
   sts lx+3,__zero_reg__  ;  movqi_insn
   ret

Ok for trunk?

Johann

--

AVR: target/107957 - Propagate zero_reg to store sources.

When -msplit-ldst is on, it may be possible to propagate __zero_reg__
to the sources of the new stores.  For example, without this patch,

unsigned long lx;

void store_lsr17 (void)
{
   lx >>= 17;
}

compiles to:

store_lsr17:
   lds r26,lx+2           ;  movqi_insn
   lds r27,lx+3           ;  movqi_insn
   movw r24,r26           ;  *movhi
   lsr r25                ;  *lshrhi3_const
   ror r24
   ldi r26,0              ;  movqi_insn
   ldi r27,0              ;  movqi_insn
   sts lx,r24             ;  movqi_insn
   sts lx+1,r25           ;  movqi_insn
   sts lx+2,r26           ;  movqi_insn
   sts lx+3,r27           ;  movqi_insn
   ret

but with this patch it becomes:

store_lsr17:
   lds r26,lx+2           ;  movqi_insn
   lds r27,lx+3           ;  movqi_insn
   movw r24,r26           ;  *movhi
   lsr r25                ;  *lshrhi3_const
   ror r24
   sts lx,r24             ;  movqi_insn
   sts lx+1,r25           ;  movqi_insn
   sts lx+2,__zero_reg__  ;  movqi_insn
   sts lx+3,__zero_reg__  ;  movqi_insn
   ret

gcc/
        PR target/107957
        * config/avr/avr-passes-fuse-move.h (bbinfo_t) <try_mem0_p>:
        Add static property.
        * config/avr/avr-passes.cc (bbinfo_t::try_mem0_p): Define it.
        (optimize_data_t::try_mem0): New method.
        (bbinfo_t::optimize_one_block) [bbinfo_t::try_mem0_p]: Run try_mem0.
        (bbinfo_t::optimize_one_function): Set bbinfo_t::try_mem0_p.
        * config/avr/avr.md (pushhi1_insn): Also allow zero as source.
        (define_split) [avropt_split_ldst]: Only run avr_split_ldst()
        when avr-fuse-add has been run at least once.
        * doc/invoke.texi (AVR Options) <-msplit-ldst>: Document it.
diff --git a/gcc/config/avr/avr-passes-fuse-move.h b/gcc/config/avr/avr-passes-fuse-move.h
index dbed1a636f3..432f9ca4670 100644
--- a/gcc/config/avr/avr-passes-fuse-move.h
+++ b/gcc/config/avr/avr-passes-fuse-move.h
@@ -1172,6 +1172,7 @@ struct bbinfo_t
 
   static find_plies_data_t *fpd;
   static bool try_fuse_p;
+  static bool try_mem0_p;
   static bool try_bin_arg1_p;
   static bool try_simplify_p;
   static bool try_split_ldi_p;
diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc
index de8de1cd2e8..fad64b1b345 100644
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -434,6 +434,11 @@ static machine_mode size_to_mode (int size)
       Split all insns where the operation can be performed on individual
       bytes, like andsi3.  In example (4) the andhi3 can be optimized
       to an andqi3.
+
+   bbinfo_t::try_mem0_p
+      Try to fuse a mem = reg insn to mem = __zero_reg__.
+      This should only occur when -msplit-ldst is on, but may
+      also occur with pushes since push<mode>1 splits them.
 */
 
 
@@ -514,6 +519,7 @@ bool bbinfo_t::try_split_any_p;
 bool bbinfo_t::try_simplify_p;
 bool bbinfo_t::use_arith_p;
 bool bbinfo_t::use_set_some_p;
+bool bbinfo_t::try_mem0_p;
 
 
 // Abstract Interpretation of expressions.
@@ -1087,6 +1093,7 @@ struct optimize_data_t
   {}
 
   bool try_fuse (bbinfo_t *);
+  bool try_mem0 (bbinfo_t *);
   bool try_bin_arg1 (bbinfo_t *);
   bool try_simplify (bbinfo_t *);
   bool try_split_ldi (bbinfo_t *);
@@ -2509,6 +2516,44 @@ bbinfo_t::run_find_plies (const insninfo_t &ii, const memento_t &memo) const
 }
 
 
+// Try to propagate __zero_reg__ to a mem = reg insn's source.
+// Returns true on success and sets .n_new_insns.
+bool
+optimize_data_t::try_mem0 (bbinfo_t *)
+{
+  rtx_insn *insn = curr.ii.m_insn;
+  rtx set, mem, reg;
+  machine_mode mode;
+
+  if (insn
+      && (set = single_set (insn))
+      && MEM_P (mem = SET_DEST (set))
+      && REG_P (reg = SET_SRC (set))
+      && GET_MODE_SIZE (mode = GET_MODE (mem)) <= 4
+      && END_REGNO (reg) <= REG_32
+      && ! (regmask (reg) & memento_t::fixed_regs_mask)
+      && curr.regs.have_value (REGNO (reg), GET_MODE_SIZE (mode), 0x0))
+    {
+      avr_dump (";; Found insn %d: mem:%m = 0 = r%d\n", INSN_UID (insn),
+		mode, REGNO (reg));
+
+      // Some insns like PUSHes don't clobber REG_CC.
+      bool clobbers_cc = GET_CODE (PATTERN (insn)) == PARALLEL;
+
+      if (clobbers_cc)
+	emit_valid_move_clobbercc (mem, CONST0_RTX (mode));
+      else
+	emit_valid_insn (gen_rtx_SET (mem, CONST0_RTX (mode)));
+
+      n_new_insns = 1;
+
+      return true;
+    }
+
+  return false;
+}
+
+
 // Try to fuse two 1-byte insns .prev and .curr to one 2-byte insn (MOVW).
 // Returns true on success, and sets .n_new_insns, .ignore_mask etc.
 bool
@@ -3108,7 +3153,8 @@ bbinfo_t::optimize_one_block (bool &changed)
 		    || (bbinfo_t::try_bin_arg1_p && od.try_bin_arg1 (this))
 		    || (bbinfo_t::try_simplify_p && od.try_simplify (this))
 		    || (bbinfo_t::try_split_ldi_p && od.try_split_ldi (this))
-		    || (bbinfo_t::try_split_any_p && od.try_split_any (this)));
+		    || (bbinfo_t::try_split_any_p && od.try_split_any (this))
+		    || (bbinfo_t::try_mem0_p && od.try_mem0 (this)));
 
       rtx_insn *new_insns = get_insns ();
       end_sequence ();
@@ -3193,6 +3239,7 @@ bbinfo_t::optimize_one_function (function *func)
 
   // Which optimization(s) to perform.
   bbinfo_t::try_fuse_p = avropt_fuse_move & 0x1;      // Digit 0 in [0, 1].
+  bbinfo_t::try_mem0_p = avropt_fuse_move & 0x1;      // Digit 0 in [0, 1].
   bbinfo_t::try_bin_arg1_p = avropt_fuse_move & 0x2;  // Digit 1 in [0, 1].
   bbinfo_t::try_split_any_p = avropt_fuse_move & 0x4; // Digit 2 in [0, 1].
   bbinfo_t::try_split_ldi_p = avropt_fuse_move >> 3;    // Digit 3 in [0, 2].
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index e475c37f10d..dd3a44bb72f 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -435,9 +435,11 @@ (define_insn "push<mode>1"
 
 (define_insn "pushhi1_insn"
   [(set (mem:HI (post_dec:HI (reg:HI REG_SP)))
-        (match_operand:HI 0 "register_operand" "r"))]
+        (match_operand:HI 0 "reg_or_0_operand" "r,Y00"))]
   ""
-  "push %B0\;push %A0"
+  "@
+	push %B0\;push %A0
+	push __zero_reg__\;push __zero_reg__"
   [(set_attr "length" "2")])
 
 ;; All modes for a multi-byte push.  We must include complex modes here too,
@@ -1014,6 +1016,9 @@ (define_split
     // provided non-volatile, addr-space = generic, no reg-overlap
     // and the resulting addressings are natively supported.
     if (avropt_split_ldst
+        // Splitting too early may obfuscate some PRE_DEC / POST_INC
+        // opportunities, thus only split after avr-fuse-add.
+        && n_avr_fuse_add_executed > 0
         && GET_MODE_SIZE (<MODE>mode) > 1
         && avr_split_ldst (operands))
       DONE;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e3c2adc2507..ad521b70ccb 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -904,9 +904,9 @@ Objective-C and Objective-C++ Dialects}.
 -mbranch-cost=@var{cost}  -mfuse-add=@var{level}  -mfuse-move=@var{level}
 -mcall-prologues  -mgas-isr-prologues  -mint8  -mflmap
 -mdouble=@var{bits}  -mlong-double=@var{bits}
--mn_flash=@var{size}  -mno-interrupts
+-mn_flash=@var{size}  -mfract-convert-truncate  -mno-interrupts
 -mmain-is-OS_task  -mrelax  -mrmw  -mstrict-X  -mtiny-stack
--mrodata-in-ram  -mfract-convert-truncate  -msplit-bit-shift
+-mrodata-in-ram  -msplit-bit-shift  -msplit-ldst
 -mshort-calls  -mskip-bug  -nodevicelib  -nodevicespecs
 -Waddr-space-convert  -Wmisspelled-isr}
 
@@ -24359,6 +24359,11 @@ This optimization is turned on per default for @option{-O2} and higher,
 including @option{-Os} but excluding @option{-Oz}.
 Splitting of shifts with a constant offset that is
 a multiple of 8 is controlled by @option{-mfuse-move}.
+
+@opindex msplit-ldst
+@item -msplit-ldst
+Split multi-byte loads and stores into several byte loads and stores.
+This optimization is turned on per default for @option{-O2} and higher.
 
 @opindex mtiny-stack
 @item -mtiny-stack

Reply via email to