thanks,

Verified: no regressions on aarch64-none-elf (bare-metal) with check-gcc/check-gdb.

-- Jiong

On 22/08/14 23:05, Richard Henderson wrote:
Delay cfi restore opcodes until the stack frame is deallocated.
This reduces the number of cfi advance opcodes required.

We perform a similar optimization in the x86_64 epilogue.


         * config/aarch64/aarch64.c (aarch64_popwb_single_reg): Remove.
         (aarch64_popwb_pair_reg): Remove.
         (aarch64_restore_callee_saves): Add CFI_OPS argument; fill it with
         the restore ops performed by the insns generated.
         (aarch64_expand_epilogue): Attach CFI_OPS to the stack deallocation
         insn.  Perform the calls_eh_return addition later; do not attempt to
         preserve the CFA in that case.  Don't use aarch64_set_frame_expr.
---
  gcc/config/aarch64/aarch64.c | 177 +++++++++++++------------------------------
  1 file changed, 52 insertions(+), 125 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c3c871e..9a11e05 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1964,23 +1964,6 @@ aarch64_pushwb_single_reg (enum machine_mode mode, unsigned regno,
    RTX_FRAME_RELATED_P (insn) = 1;
  }

-static void
-aarch64_popwb_single_reg (enum machine_mode mode, unsigned regno,
-                         HOST_WIDE_INT adjustment)
-{
-  rtx base_rtx = stack_pointer_rtx;
-  rtx insn, reg, mem;
-
-  reg = gen_rtx_REG (mode, regno);
-  mem = gen_rtx_POST_MODIFY (Pmode, base_rtx,
-                            plus_constant (Pmode, base_rtx, adjustment));
-  mem = gen_rtx_MEM (mode, mem);
-
-  insn = emit_move_insn (reg, mem);
-  add_reg_note (insn, REG_CFA_RESTORE, reg);
-  RTX_FRAME_RELATED_P (insn) = 1;
-}
-
  static rtx
  aarch64_gen_storewb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
                           HOST_WIDE_INT adjustment)
@@ -2011,7 +1994,6 @@ aarch64_pushwb_pair_reg (enum machine_mode mode, unsigned regno1,
    insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
                                               reg2, adjustment));
    RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
-
    RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
    RTX_FRAME_RELATED_P (insn) = 1;
  }
@@ -2033,29 +2015,6 @@ aarch64_gen_loadwb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
      }
  }

-static void
-aarch64_popwb_pair_reg (enum machine_mode mode, unsigned regno1,
-                       unsigned regno2, HOST_WIDE_INT adjustment, rtx cfa)
-{
-  rtx insn;
-  rtx reg1 = gen_rtx_REG (mode, regno1);
-  rtx reg2 = gen_rtx_REG (mode, regno2);
-
-  insn = emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
-                                            reg2, adjustment));
-  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
-  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
-  RTX_FRAME_RELATED_P (insn) = 1;
-
-  if (cfa)
-    add_reg_note (insn, REG_CFA_ADJUST_CFA,
-                 (gen_rtx_SET (Pmode, stack_pointer_rtx,
-                               plus_constant (Pmode, cfa, adjustment))));
-
-  add_reg_note (insn, REG_CFA_RESTORE, reg1);
-  add_reg_note (insn, REG_CFA_RESTORE, reg2);
-}
-
  static rtx
  aarch64_gen_store_pair (enum machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
                         rtx reg2)
@@ -2151,9 +2110,8 @@ aarch64_save_callee_saves (enum machine_mode mode, HOST_WIDE_INT start_offset,
  static void
  aarch64_restore_callee_saves (enum machine_mode mode,
                               HOST_WIDE_INT start_offset, unsigned start,
-                             unsigned limit, bool skip_wb)
+                             unsigned limit, bool skip_wb, rtx *cfi_ops)
  {
-  rtx insn;
    rtx base_rtx = stack_pointer_rtx;
    rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
                                                  ? gen_frame_mem : gen_rtx_MEM);
@@ -2187,25 +2145,14 @@ aarch64_restore_callee_saves (enum machine_mode mode,

           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
-         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2,
-                                                  mem2));
-         add_reg_note (insn, REG_CFA_RESTORE, reg);
-         add_reg_note (insn, REG_CFA_RESTORE, reg2);
+         emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

-         /* The first part of a frame-related parallel insn is
-            always assumed to be relevant to the frame
-            calculations; subsequent parts, are only
-            frame-related if explicitly marked.  */
-         RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
+         *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
           regno = regno2;
         }
        else
-       {
-         insn = emit_move_insn (reg, mem);
-         add_reg_note (insn, REG_CFA_RESTORE, reg);
-       }
-
-      RTX_FRAME_RELATED_P (insn) = 1;
+       emit_move_insn (reg, mem);
+      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
      }
  }

@@ -2418,7 +2365,6 @@ aarch64_expand_epilogue (bool for_sibcall)
    HOST_WIDE_INT frame_size, offset;
    HOST_WIDE_INT fp_offset;
    rtx insn;
-  rtx cfa_reg;

    aarch64_layout_frame ();

@@ -2426,8 +2372,6 @@ aarch64_expand_epilogue (bool for_sibcall)
    fp_offset = cfun->machine->frame.frame_size
               - cfun->machine->frame.hard_fp_offset;

-  cfa_reg = frame_pointer_needed ? hard_frame_pointer_rtx : stack_pointer_rtx;
-
    /* Store pairs and load pairs have a range only -512 to 504.  */
    if (offset >= 512)
      {
@@ -2459,11 +2403,6 @@ aarch64_expand_epilogue (bool for_sibcall)
                                        hard_frame_pointer_rtx,
                                        GEN_INT (0)));
        offset = offset - fp_offset;
-      RTX_FRAME_RELATED_P (insn) = 1;
-      /* As SP is set to (FP - fp_offset), according to the rules in
-        dwarf2cfi.c:dwarf2out_frame_debug_expr, CFA should be calculated
-        from the value of SP from now on.  */
-      cfa_reg = stack_pointer_rtx;
      }

    if (offset > 0)
@@ -2471,6 +2410,7 @@ aarch64_expand_epilogue (bool for_sibcall)
        unsigned reg1 = cfun->machine->frame.wb_candidate1;
        unsigned reg2 = cfun->machine->frame.wb_candidate2;
        bool skip_wb = true;
+      rtx cfi_ops = NULL;

        if (frame_pointer_needed)
         fp_offset = 0;
@@ -2481,99 +2421,86 @@ aarch64_expand_epilogue (bool for_sibcall)
         skip_wb = false;

        aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
-                                   skip_wb);
+                                   skip_wb, &cfi_ops);
        aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
-                                   skip_wb);
+                                   skip_wb, &cfi_ops);

        if (skip_wb)
         {
           enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
+         rtx rreg1 = gen_rtx_REG (mode1, reg1);

+         cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
           if (reg2 == FIRST_PSEUDO_REGISTER)
-           aarch64_popwb_single_reg (mode1, reg1, offset);
+           {
+             rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
+             mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
+             mem = gen_rtx_MEM (mode1, mem);
+             insn = emit_move_insn (rreg1, mem);
+           }
           else
             {
-             if (reg1 != HARD_FRAME_POINTER_REGNUM)
-               cfa_reg = NULL;
+             rtx rreg2 = gen_rtx_REG (mode1, reg2);

-             aarch64_popwb_pair_reg (mode1, reg1, reg2, offset, cfa_reg);
+             cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
+             insn = aarch64_gen_loadwb_pair (mode1, stack_pointer_rtx, rreg1,
+                                             rreg2, offset);
+             insn = emit_insn (insn);
             }
         }
        else
         {
           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
                                            GEN_INT (offset)));
-         RTX_FRAME_RELATED_P (insn) = 1;
         }
-    }
-
-  /* Stack adjustment for exception handler.  */
-  if (crtl->calls_eh_return)
-    {
-      /* We need to unwind the stack by the offset computed by
-        EH_RETURN_STACKADJ_RTX.  However, at this point the CFA is
-        based on SP.  Ideally we would update the SP and define the
-        CFA along the lines of:
-
-        SP = SP + EH_RETURN_STACKADJ_RTX
-        (regnote CFA = SP - EH_RETURN_STACKADJ_RTX)
-
-        However the dwarf emitter only understands a constant
-        register offset.
-
-        The solution chosen here is to use the otherwise unused IP0
-        as a temporary register to hold the current SP value.  The
-        CFA is described using IP0 then SP is modified.  */

-      rtx ip0 = gen_rtx_REG (DImode, IP0_REGNUM);
-
-      insn = emit_move_insn (ip0, stack_pointer_rtx);
-      add_reg_note (insn, REG_CFA_DEF_CFA, ip0);
+      /* Reset the CFA to be SP + FRAME_SIZE.  */
+      rtx new_cfa = stack_pointer_rtx;
+      if (frame_size > 0)
+       new_cfa = plus_constant (Pmode, new_cfa, frame_size);
+      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
+      REG_NOTES (insn) = cfi_ops;
        RTX_FRAME_RELATED_P (insn) = 1;
-
-      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
-
-      /* Ensure the assignment to IP0 does not get optimized away.  */
-      emit_use (ip0);
      }

-  if (frame_size > -1)
+  if (frame_size > 0)
      {
        if (frame_size >= 0x1000000)
         {
           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
           emit_move_insn (op0, GEN_INT (frame_size));
-         emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
-         aarch64_set_frame_expr (gen_rtx_SET
-                                 (Pmode, stack_pointer_rtx,
-                                  plus_constant (Pmode,
-                                                 stack_pointer_rtx,
-                                                 frame_size)));
+         insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
         }
-      else if (frame_size > 0)
+      else
         {
-         if ((frame_size & 0xfff) != 0)
-           {
-             insn = emit_insn (gen_add2_insn
-                               (stack_pointer_rtx,
-                                GEN_INT ((frame_size
-                                          & (HOST_WIDE_INT) 0xfff))));
-             RTX_FRAME_RELATED_P (insn) = 1;
-           }
-         if ((frame_size & 0xfff) != frame_size)
+          int hi_ofs = frame_size & 0xfff000;
+          int lo_ofs = frame_size & 0x000fff;
+
+         if (hi_ofs && lo_ofs)
             {
               insn = emit_insn (gen_add2_insn
-                               (stack_pointer_rtx,
-                                GEN_INT ((frame_size
-                                          & ~ (HOST_WIDE_INT) 0xfff))));
+                               (stack_pointer_rtx, GEN_INT (hi_ofs)));
               RTX_FRAME_RELATED_P (insn) = 1;
+             frame_size = lo_ofs;
             }
+         insn = emit_insn (gen_add2_insn
+                           (stack_pointer_rtx, GEN_INT (frame_size)));
         }

-      aarch64_set_frame_expr (gen_rtx_SET (Pmode, stack_pointer_rtx,
-                                          plus_constant (Pmode,
-                                                         stack_pointer_rtx,
-                                                         offset)));
+      /* Reset the CFA to be SP + 0.  */
+      add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+
+  /* Stack adjustment for exception handler.  */
+  if (crtl->calls_eh_return)
+    {
+      /* We need to unwind the stack by the offset computed by
+        EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
+        to be SP; letting the CFA move during this adjustment
+        is just as correct as retaining the CFA from the body
+        of the function.  Therefore, do nothing special.  */
+      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
      }

    emit_use (gen_rtx_REG (DImode, LR_REGNUM));
--
1.8.3.1





Reply via email to