On 29/07/16 12:49, Wilco Dijkstra wrote:
> This patch optimizes the prolog and epilog code to reduce the number of
> instructions and avoid multiple writes to SP.  The key idea is that epilogs
> are almost exact reverses of prologs, and thus all the decisions only need
> to be taken once.  The frame layout is decided in aarch64_layout_frame()
> and decisions recorded in the new aarch64_frame fields initial_adjust,
> callee_adjust, callee_offset and final_adjust.
> 
> A generic frame setup consists of 5 basic steps:
> 
> 1. sub sp, sp, initial_adjust
> 2. stp reg1, reg2, [sp, -callee_adjust]!      (push if callee_adjust != 0)
> 3. add fp, sp, callee_offset                  (if frame_pointer_needed)
> 4. stp reg3, reg4, [sp, callee_offset + N*16] (store remaining callee-saves)
> 5. sub sp, sp, final_adjust
> 
> The epilog reverses this, and may omit step 3 if alloca wasn't used.
> 
> Bootstrap, GCC & gdb regression OK.
> 
> ChangeLog:
> 2016-07-29  Wilco Dijkstra  <wdijk...@arm.com>
> 
> gcc/
>       * config/aarch64/aarch64.h (aarch64_frame):
>       Remove padding0 and hardfp_offset.  Add locals_offset,
>       initial_adjust, callee_adjust, callee_offset and final_adjust.
>       * config/aarch64/aarch64.c (aarch64_layout_frame):
>       Remove unused padding0 and hardfp_offset initializations.
>       Choose frame layout and set frame variables accordingly.
>       Use INVALID_REGNUM instead of FIRST_PSEUDO_REGISTER.
>       (aarch64_push_regs): Use INVALID_REGNUM, not FIRST_PSEUDO_REGISTER.
>       (aarch64_pop_regs): Likewise.
>       (aarch64_expand_prologue): Remove all decision code, just emit
>       prolog according to frame variables.
>       (aarch64_expand_epilogue): Remove all decision code, just emit
>       epilog according to frame variables.
>       (aarch64_initial_elimination_offset): Use offset to local/arg area.
> 
> testsuite/
>       * gcc.target/aarch64/test_frame_10.c: Fix test to check for a
>       single stack adjustment, no writeback.  
>       * gcc.target/aarch64/test_frame_12.c: Likewise.
>       * gcc.target/aarch64/test_frame_13.c: Likewise.
>       * gcc.target/aarch64/test_frame_15.c: Likewise.
>       * gcc.target/aarch64/test_frame_6.c: Likewise.
>       * gcc.target/aarch64/test_frame_7.c: Likewise.
>       * gcc.target/aarch64/test_frame_8.c: Likewise.
>       * gcc.target/aarch64/test_frame_16.c: New test.


Two minor nits, but otherwise OK.

R.

> ---
> 
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 
> 58959229c004e58405076b0e691b6b5634720140..455869f074dd72a38b6f8e1b199d83aa75b408b1
>  100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -550,11 +550,14 @@ struct GTY (()) aarch64_frame
>       STACK_BOUNDARY.  */
>    HOST_WIDE_INT saved_varargs_size;
>  
> +  /* The size of the saved callee-save int/FP registers.  */
> +
>    HOST_WIDE_INT saved_regs_size;
> -  /* Padding if needed after the all the callee save registers have
> -     been saved.  */
> -  HOST_WIDE_INT padding0;
> -  HOST_WIDE_INT hardfp_offset;       /* HARD_FRAME_POINTER_REGNUM */
> +
> +  /* Offset from the base of the frame (incomming SP) to the
> +     top of the locals area.  This value is always a multiple of
> +     STACK_BOUNDARY.  */
> +  HOST_WIDE_INT locals_offset;
>  
>    /* Offset from the base of the frame (incomming SP) to the
>       hard_frame_pointer.  This value is always a multiple of
> @@ -564,12 +567,25 @@ struct GTY (()) aarch64_frame
>    /* The size of the frame.  This value is the offset from base of the
>     * frame (incomming SP) to the stack_pointer.  This value is always
>     * a multiple of STACK_BOUNDARY.  */
> +  HOST_WIDE_INT frame_size;
> +
> +  /* The size of the initial stack adjustment before saving callee-saves.  */
> +  HOST_WIDE_INT initial_adjust;
> +
> +  /* The writeback value when pushing callee-save registers.
> +     It is zero when no push is used.  */
> +  HOST_WIDE_INT callee_adjust;
> +
> +  /* The offset from SP to the callee-save registers after initial_adjust.
> +     It may be non-zero if no push is used (ie. callee_adjust == 0).  */
> +  HOST_WIDE_INT callee_offset;
> +
> +  /* The size of the stack adjustment after saving callee-saves.  */
> +  HOST_WIDE_INT final_adjust;
>  
>    unsigned wb_candidate1;
>    unsigned wb_candidate2;
>  
> -  HOST_WIDE_INT frame_size;
> -
>    bool laid_out;
>  };
>  
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 2023cb45fc8e87d94b48ae894bea78235056d4a4..7179dac29736409e1679e4bc932b95ba4c9aa1a5
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -2728,8 +2728,8 @@ aarch64_layout_frame (void)
>  #define SLOT_NOT_REQUIRED (-2)
>  #define SLOT_REQUIRED     (-1)
>  
> -  cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
> -  cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
> +  cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
> +  cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
>  
>    /* First mark all the registers that really need to be saved...  */
>    for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
> @@ -2763,7 +2763,6 @@ aarch64_layout_frame (void)
>        cfun->machine->frame.wb_candidate1 = R29_REGNUM;
>        cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
>        cfun->machine->frame.wb_candidate2 = R30_REGNUM;
> -      cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
>        offset += 2 * UNITS_PER_WORD;
>      }
>  
> @@ -2772,9 +2771,9 @@ aarch64_layout_frame (void)
>      if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
>        {
>       cfun->machine->frame.reg_offset[regno] = offset;
> -     if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
> +     if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
>         cfun->machine->frame.wb_candidate1 = regno;
> -     else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
> +     else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
>         cfun->machine->frame.wb_candidate2 = regno;
>       offset += UNITS_PER_WORD;
>        }
> @@ -2783,24 +2782,23 @@ aarch64_layout_frame (void)
>      if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
>        {
>       cfun->machine->frame.reg_offset[regno] = offset;
> -     if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
> +     if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
>         cfun->machine->frame.wb_candidate1 = regno;
> -     else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
> +     else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
>                && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
>         cfun->machine->frame.wb_candidate2 = regno;
>       offset += UNITS_PER_WORD;
>        }
>  
> -  cfun->machine->frame.padding0 =
> -    (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
>    offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
>  
>    cfun->machine->frame.saved_regs_size = offset;
>  
> +  HOST_WIDE_INT varargs_and_saved_regs_size = offset
> +    + cfun->machine->frame.saved_varargs_size;

This should be written either as

HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

or as

HOST_WIDE_INT varargs_and_saved_regs_size = (offset
                                             + cfun->machine->frame.saved_varargs_size);

Which form you use may depend on the overall line length.  In this case, I 
think the former is preferable.

> +
>    cfun->machine->frame.hard_fp_offset
> -    = ROUND_UP (cfun->machine->frame.saved_varargs_size
> -             + get_frame_size ()
> -             + cfun->machine->frame.saved_regs_size,
> +    = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
>               STACK_BOUNDARY / BITS_PER_UNIT);
>  
>    cfun->machine->frame.frame_size
> @@ -2808,6 +2806,77 @@ aarch64_layout_frame (void)
>               + crtl->outgoing_args_size,
>               STACK_BOUNDARY / BITS_PER_UNIT);
>  
> +  cfun->machine->frame.locals_offset = 
> cfun->machine->frame.saved_varargs_size;
> +
> +  cfun->machine->frame.initial_adjust = 0;
> +  cfun->machine->frame.final_adjust = 0;
> +  cfun->machine->frame.callee_adjust = 0;
> +  cfun->machine->frame.callee_offset = 0;
> +
> +  HOST_WIDE_INT max_push_offset = 0;
> +  if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
> +    max_push_offset = 512;
> +  else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
> +    max_push_offset = 256;
> +
> +  if (cfun->machine->frame.frame_size < max_push_offset
> +      && crtl->outgoing_args_size == 0)
> +    {
> +      /* Simple, small frame with no outgoing arguments:
> +      stp reg1, reg2, [sp, -frame_size]!
> +      stp reg3, reg4, [sp, 16]  */
> +      cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
> +    }
> +  else if (crtl->outgoing_args_size
> +        + cfun->machine->frame.saved_regs_size < 512

This sub-expression should be wrapped in parentheses, so that the '+' indents 
more deeply than the '&&' that follows.  Otherwise the logic can be slightly 
confusing.

> +        && !(cfun->calls_alloca
> +             && cfun->machine->frame.hard_fp_offset < max_push_offset))
> +    {
> +      /* Frame with small outgoing arguments:
> +      sub sp, sp, frame_size
> +      stp reg1, reg2, [sp, outgoing_args_size]
> +      stp reg3, reg4, [sp, outgoing_args_size + 16]  */
> +      cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
> +      cfun->machine->frame.callee_offset
> +     = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
> +    }
> +  else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
> +    {
> +      /* Frame with large outgoing arguments but a small local area:
> +      stp reg1, reg2, [sp, -hard_fp_offset]!
> +      stp reg3, reg4, [sp, 16]
> +      sub sp, sp, outgoing_args_size  */
> +      cfun->machine->frame.callee_adjust = 
> cfun->machine->frame.hard_fp_offset;
> +      cfun->machine->frame.final_adjust
> +     = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
> +    }
> +  else if (!frame_pointer_needed
> +        && varargs_and_saved_regs_size < max_push_offset)
> +    {
> +      /* Frame with large local area and outgoing arguments (this pushes the
> +      callee-saves first, followed by the locals and outgoing area):
> +      stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
> +      stp reg3, reg4, [sp, 16]
> +      sub sp, sp, frame_size - varargs_and_saved_regs_size  */
> +      cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
> +      cfun->machine->frame.final_adjust
> +     = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
> +      cfun->machine->frame.hard_fp_offset = 
> cfun->machine->frame.callee_adjust;
> +      cfun->machine->frame.locals_offset = 
> cfun->machine->frame.hard_fp_offset;
> +    }
> +  else
> +    {
> +      /* Frame with large local area and outgoing arguments using frame 
> pointer:
> +      sub sp, sp, hard_fp_offset
> +      stp x29, x30, [sp, 0]
> +      add x29, sp, 0
> +      stp reg3, reg4, [sp, 16]
> +      sub sp, sp, outgoing_args_size  */
> +      cfun->machine->frame.initial_adjust = 
> cfun->machine->frame.hard_fp_offset;
> +      cfun->machine->frame.final_adjust
> +     = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
> +    }
> +
>    cfun->machine->frame.laid_out = true;
>  }
>  
> @@ -2866,7 +2935,7 @@ aarch64_push_regs (unsigned regno1, unsigned regno2, 
> HOST_WIDE_INT adjustment)
>    rtx_insn *insn;
>    machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
>  
> -  if (regno2 == FIRST_PSEUDO_REGISTER)
> +  if (regno2 == INVALID_REGNUM)
>      return aarch64_pushwb_single_reg (mode, regno1, adjustment);
>  
>    rtx reg1 = gen_rtx_REG (mode, regno1);
> @@ -2905,7 +2974,7 @@ aarch64_pop_regs (unsigned regno1, unsigned regno2, 
> HOST_WIDE_INT adjustment,
>  
>    *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
>  
> -  if (regno2 == FIRST_PSEUDO_REGISTER)
> +  if (regno2 == INVALID_REGNUM)
>      {
>        rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
>        mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
> @@ -3106,23 +3175,16 @@ aarch64_restore_callee_saves (machine_mode mode,
>  void
>  aarch64_expand_prologue (void)
>  {
> -  /* sub sp, sp, #<frame_size>
> -     stp {fp, lr}, [sp, #<frame_size> - 16]
> -     add fp, sp, #<frame_size> - hardfp_offset
> -     stp {cs_reg}, [fp, #-16] etc.
> -
> -     sub sp, sp, <final_adjustment_if_any>
> -  */
> -  HOST_WIDE_INT frame_size, offset;
> -  HOST_WIDE_INT fp_offset;           /* Offset from hard FP to SP.  */
> -  HOST_WIDE_INT hard_fp_offset;
> -  rtx_insn *insn;
> -
>    aarch64_layout_frame ();
>  
> -  offset = frame_size = cfun->machine->frame.frame_size;
> -  hard_fp_offset = cfun->machine->frame.hard_fp_offset;
> -  fp_offset = frame_size - hard_fp_offset;
> +  HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
> +  HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
> +  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
> +  HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
> +  HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
> +  unsigned reg1 = cfun->machine->frame.wb_candidate1;
> +  unsigned reg2 = cfun->machine->frame.wb_candidate2;
> +  rtx_insn *insn;
>  
>    if (flag_stack_usage_info)
>      current_function_static_stack_size = frame_size;
> @@ -3139,94 +3201,29 @@ aarch64_expand_prologue (void)
>       aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
>      }
>  
> -  /* Store pairs and load pairs have a range only -512 to 504.  */
> -  if (offset >= 512)
> -    {
> -      /* When the frame has a large size, an initial decrease is done on
> -      the stack pointer to jump over the callee-allocated save area for
> -      register varargs, the local variable area and/or the callee-saved
> -      register area.  This will allow the pre-index write-back
> -      store pair instructions to be used for setting up the stack frame
> -      efficiently.  */
> -      offset = hard_fp_offset;
> -      if (offset >= 512)
> -     offset = cfun->machine->frame.saved_regs_size;
> -
> -      frame_size -= (offset + crtl->outgoing_args_size);
> -      fp_offset = 0;
> +  aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
>  
> -      aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -frame_size, true);
> -    }
> -  else
> -    frame_size = -1;
> +  if (callee_adjust != 0)
> +    aarch64_push_regs (reg1, reg2, callee_adjust);
>  
> -  if (offset > 0)
> +  if (frame_pointer_needed)
>      {
> -      bool skip_wb = false;
> -
> -      if (frame_pointer_needed)
> -     {
> -       skip_wb = true;
> -
> -       if (fp_offset)
> -         {
> -           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
> -                                            GEN_INT (-offset)));
> -           RTX_FRAME_RELATED_P (insn) = 1;
> -
> -           aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
> -                                      R30_REGNUM, false);
> -         }
> -       else
> -         aarch64_push_regs (R29_REGNUM, R30_REGNUM, offset);
> -
> -       /* Set up frame pointer to point to the location of the
> -          previous frame pointer on the stack.  */
> -       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
> -                                        stack_pointer_rtx,
> -                                        GEN_INT (fp_offset)));
> -       RTX_FRAME_RELATED_P (insn) = 1;
> -       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
> -     }
> -      else
> -     {
> -       unsigned reg1 = cfun->machine->frame.wb_candidate1;
> -       unsigned reg2 = cfun->machine->frame.wb_candidate2;
> -
> -       if (fp_offset
> -           || reg1 == FIRST_PSEUDO_REGISTER
> -           || (reg2 == FIRST_PSEUDO_REGISTER
> -               && offset >= 256))
> -         {
> -           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
> -                                            GEN_INT (-offset)));
> -           RTX_FRAME_RELATED_P (insn) = 1;
> -         }
> -       else
> -         {
> -           aarch64_push_regs (reg1, reg2, offset);
> -           skip_wb = true;
> -         }
> -     }
> -
> -      aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
> -                              skip_wb);
> -      aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
> -                              skip_wb);
> +      if (callee_adjust == 0)
> +     aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
> +                                R30_REGNUM, false);
> +      insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
> +                                    stack_pointer_rtx,
> +                                    GEN_INT (callee_offset)));
> +      RTX_FRAME_RELATED_P (insn) = 1;
> +      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
>      }
>  
> -  /* when offset >= 512,
> -     sub sp, sp, #<outgoing_args_size> */
> -  if (frame_size > -1)
> -    {
> -      if (crtl->outgoing_args_size > 0)
> -     {
> -       insn = emit_insn (gen_add2_insn
> -                         (stack_pointer_rtx,
> -                          GEN_INT (- crtl->outgoing_args_size)));
> -       RTX_FRAME_RELATED_P (insn) = 1;
> -     }
> -    }
> +  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
> +                          callee_adjust != 0 || frame_pointer_needed);
> +  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
> +                          callee_adjust != 0 || frame_pointer_needed);
> +  aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
> +                     !frame_pointer_needed);
>  }
>  
>  /* Return TRUE if we can use a simple_return insn.
> @@ -3249,104 +3246,80 @@ aarch64_use_return_insn_p (void)
>    return cfun->machine->frame.frame_size == 0;
>  }
>  
> -/* Generate the epilogue instructions for returning from a function.  */
> +/* Generate the epilogue instructions for returning from a function.
> +   This is almost exactly the reverse of the prolog sequence, except
> +   that we need to insert barriers to avoid scheduling loads that read
> +   from a deallocated stack, and we optimize the unwind records by
> +   emitting them all together if possible.  */
>  void
>  aarch64_expand_epilogue (bool for_sibcall)
>  {
> -  HOST_WIDE_INT frame_size, offset;
> -  HOST_WIDE_INT fp_offset;
> -  HOST_WIDE_INT hard_fp_offset;
> -  rtx_insn *insn;
> -  /* We need to add memory barrier to prevent read from deallocated stack.  
> */
> -  bool need_barrier_p = (get_frame_size () != 0
> -                      || cfun->machine->frame.saved_varargs_size);
> -
>    aarch64_layout_frame ();
>  
> -  offset = frame_size = cfun->machine->frame.frame_size;
> -  hard_fp_offset = cfun->machine->frame.hard_fp_offset;
> -  fp_offset = frame_size - hard_fp_offset;
> +  HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
> +  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
> +  HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
> +  HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
> +  unsigned reg1 = cfun->machine->frame.wb_candidate1;
> +  unsigned reg2 = cfun->machine->frame.wb_candidate2;
> +  rtx cfi_ops = NULL;
> +  rtx_insn *insn;
>  
> -  /* Store pairs and load pairs have a range only -512 to 504.  */
> -  if (offset >= 512)
> -    {
> -      offset = hard_fp_offset;
> -      if (offset >= 512)
> -     offset = cfun->machine->frame.saved_regs_size;
> +  /* We need to add memory barrier to prevent read from deallocated stack.  
> */
> +  bool need_barrier_p = (get_frame_size ()
> +                      + cfun->machine->frame.saved_varargs_size) != 0;
>  
> -      frame_size -= (offset + crtl->outgoing_args_size);
> -      fp_offset = 0;
> -      if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
> -     {
> -       insn = emit_insn (gen_add2_insn
> -                         (stack_pointer_rtx,
> -                          GEN_INT (crtl->outgoing_args_size)));
> -       RTX_FRAME_RELATED_P (insn) = 1;
> -     }
> +  /* Emit a barrier to prevent loads from a deallocated stack.  */
> +  if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
> +    {
> +      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
> +      need_barrier_p = false;
>      }
> -  else
> -    frame_size = -1;
>  
> -  /* If there were outgoing arguments or we've done dynamic stack
> -     allocation, then restore the stack pointer from the frame
> -     pointer.  This is at most one insn and more efficient than using
> -     GCC's internal mechanism.  */
> -  if (frame_pointer_needed
> -      && (crtl->outgoing_args_size || cfun->calls_alloca))
> +  /* Restore the stack pointer from the frame pointer if it may not
> +     be the same as the stack pointer.  */
> +  if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
>      {
> -      if (cfun->calls_alloca)
> -     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
> -
>        insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
>                                      hard_frame_pointer_rtx,
> -                                    GEN_INT (0)));
> -      offset = offset - fp_offset;
> +                                    GEN_INT (-callee_offset)));
> +      /* If writeback is used when restoring callee-saves, the CFA
> +      is restored on the instruction doing the writeback.  */
> +      RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
>      }
> +  else
> +    aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true);
>  
> -  if (offset > 0)
> -    {
> -      unsigned reg1 = cfun->machine->frame.wb_candidate1;
> -      unsigned reg2 = cfun->machine->frame.wb_candidate2;
> -      bool skip_wb = true;
> -      rtx cfi_ops = NULL;
> -
> -      if (frame_pointer_needed)
> -     fp_offset = 0;
> -      else if (fp_offset
> -            || reg1 == FIRST_PSEUDO_REGISTER
> -            || (reg2 == FIRST_PSEUDO_REGISTER
> -                && offset >= 256))
> -     skip_wb = false;
> -
> -      aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
> -                                 skip_wb, &cfi_ops);
> -      aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
> -                                 skip_wb, &cfi_ops);
> +  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
> +                             callee_adjust != 0, &cfi_ops);
> +  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
> +                             callee_adjust != 0, &cfi_ops);
>  
> -      if (need_barrier_p)
> -     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
> +  if (need_barrier_p)
> +    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
>  
> -      if (skip_wb)
> -     aarch64_pop_regs (reg1, reg2, offset, &cfi_ops);
> -      else
> -     emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (offset)));
> +  if (callee_adjust != 0)
> +    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
>  
> -      /* Reset the CFA to be SP + FRAME_SIZE.  */
> -      rtx new_cfa = stack_pointer_rtx;
> -      if (frame_size > 0)
> -     new_cfa = plus_constant (Pmode, new_cfa, frame_size);
> -      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
> +  if (callee_adjust != 0 || initial_adjust > 65536)
> +    {
> +      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
>        insn = get_last_insn ();
> -      REG_NOTES (insn) = cfi_ops;
> +      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
> +      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
>        RTX_FRAME_RELATED_P (insn) = 1;
> +      cfi_ops = NULL;
>      }
>  
> -  if (frame_size > 0)
> -    {
> -      if (need_barrier_p)
> -     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
> +  aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true);
>  
> -      aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, frame_size, true);
> +  if (cfi_ops)
> +    {
> +      /* Emit delayed restores and reset the CFA to be SP.  */
> +      insn = get_last_insn ();
> +      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
> +      REG_NOTES (insn) = cfi_ops;
> +      RTX_FRAME_RELATED_P (insn) = 1;
>      }
>  
>    /* Stack adjustment for exception handler.  */
> @@ -5173,18 +5146,18 @@ aarch64_initial_elimination_offset (unsigned from, 
> unsigned to)
>    if (to == HARD_FRAME_POINTER_REGNUM)
>      {
>        if (from == ARG_POINTER_REGNUM)
> -     return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
> +     return cfun->machine->frame.hard_fp_offset;
>  
>        if (from == FRAME_POINTER_REGNUM)
> -     return (cfun->machine->frame.hard_fp_offset
> -             - cfun->machine->frame.saved_varargs_size);
> +     return cfun->machine->frame.hard_fp_offset
> +            - cfun->machine->frame.locals_offset;
>      }
>  
>    if (to == STACK_POINTER_REGNUM)
>      {
>        if (from == FRAME_POINTER_REGNUM)
> -       return (cfun->machine->frame.frame_size
> -               - cfun->machine->frame.saved_varargs_size);
> +       return cfun->machine->frame.frame_size
> +              - cfun->machine->frame.locals_offset;
>      }
>  
>    return cfun->machine->frame.frame_size;
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_10.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
> index 
> 70dd6539af93a034ae64f8603089c6d6f59a6b53..e23a4a83528b71a0de0c95752a9e530bf4ca79e5
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_10.c
> @@ -4,8 +4,7 @@
>       * total frame size > 512.
>         area except outgoing <= 512
>       * number of callee-saved reg >= 2.
> -     * Split stack adjustment into two subtractions.
> -       the first subtractions could be optimized into "stp !".  */
> +     * Use a single stack adjustment, no writeback.  */
>  
>  /* { dg-do run } */
>  /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
> @@ -15,6 +14,6 @@
>  t_frame_pattern_outgoing (test10, 480, "x19", 24, a[8], a[9], a[10])
>  t_frame_run (test10)
>  
> -/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 
> 1 } } */
> -/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 
> } } */
> +/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 
> } } */
> +/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 
> } } */
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> index 
> 2353477c29ea99c56e73a34cf0449cf6c669e973..3d7d3594610c645d2d6f449b6ee0400fdd395849
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c
> @@ -13,6 +13,6 @@ t_frame_run (test12)
>  
>  /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
>  
> -/* Check epilogue using write-back.  */
> -/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp\\\], \[0-9\]+" 3 
> } } */
> +/* Check epilogue using no write-back.  */
> +/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 
> } } */
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_13.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
> index 
> f3aa263929421db12b78abc733e2b011db3a4e48..74b3370fa463b652265e00fff80cc8856524d509
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_13.c
> @@ -2,8 +2,7 @@
>       * without outgoing.
>       * total frame size > 512.
>       * number of callee-save reg >= 2.
> -     * split the stack adjustment into two substractions,
> -       the second could be optimized into "stp !".  */
> +     * Use a single stack adjustment, no writeback.  */
>  
>  /* { dg-do run } */
>  /* { dg-options "-O2 --save-temps" } */
> @@ -14,4 +13,4 @@ t_frame_pattern (test13, 700, )
>  t_frame_run (test13)
>  
>  /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" 
> 2 } } */
> +/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp\\\]" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_15.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
> index 
> fc6f713232de52b72ba5c3eef92e1aea6526199d..bed6714b4fe529a3b81ad8c5253924aa97bf8806
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_15.c
> @@ -3,8 +3,7 @@
>       * total frame size > 512.
>         area except outgoing <= 512
>       * number of callee-save reg >= 2.
> -     * split the stack adjustment into two substractions,
> -       the first could be optimized into "stp !".  */
> +     * Use a single stack adjustment, no writeback.  */
>  
>  /* { dg-do run } */
>  /* { dg-options "-O2 --save-temps" } */
> @@ -15,4 +14,4 @@ t_frame_pattern_outgoing (test15, 480, , 8, a[8])
>  t_frame_run (test15)
>  
>  /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" 
> 3 } } */
> +/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 
> } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..28f3826adadd5eaa6486659e4d6b6d7c5960b9d2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> @@ -0,0 +1,25 @@
> +/* Verify:
> +     * with outgoing.
> +     * single int register push.
> +     * varargs and callee-save size >= 256
> +     * Use 2 stack adjustments.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
> +
> +#define REP8(X) X,X,X,X,X,X,X,X
> +#define REP64(X) REP8(REP8(X))
> +
> +void outgoing (__builtin_va_list, ...);
> +
> +double vararg_outgoing (int x1, ...)
> +{
> +  double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = 
> x1 * 6;
> +  __builtin_va_list vl;
> +  __builtin_va_start (vl, x1);
> +  outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1));
> +  __builtin_va_end (vl);
> +  return a1 + a2 + a3 + a4 + a5 + a6;
> +}
> +
> +/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_6.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
> index 
> d8481346c58458934deecb4b7f38fb5821517b56..6a753dff87e28fa71a2f69df5fb95559163fa6cd
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_6.c
> @@ -3,8 +3,7 @@
>       * without outgoing.
>       * total frame size > 512.
>       * number of callee-saved reg == 1.
> -     * split stack adjustment into two subtractions.
> -       the second subtraction should use "str !".  */
> +     * use a single stack adjustment, no writeback.  */
>  
>  /* { dg-do run } */
>  /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
> @@ -14,6 +13,7 @@
>  t_frame_pattern (test6, 700, )
>  t_frame_run (test6)
>  
> -/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 2 } 
> } */
> -/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 2 } } 
> */
> +/* { dg-final { scan-assembler-times "str\tx30, \\\[sp\\\]" 1 } } */
> +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]" 2 } } */
> +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]," 1 } } */
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_7.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
> index 
> d87d68b3eec72dd23b279ea94391a400c9ae5a9a..f2a8713d19d9f7df49073e9588c5d74661491fb6
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_7.c
> @@ -3,8 +3,7 @@
>       * without outgoing.
>       * total frame size > 512.
>       * number of callee-saved reg == 2.
> -     * split stack adjustment into two subtractions.
> -       the second subtraction should use "stp !".  */
> +     * use a single stack adjustment, no writeback.  */
>  
>  /* { dg-do run } */
>  /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */
> @@ -14,6 +13,6 @@
>  t_frame_pattern (test7, 700, "x19")
>  t_frame_run (test7)
>  
> -/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" 
> 1 } } */
> -/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 
> } } */
> +/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp]" 1 } } */
> +/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\]" 1 } } */
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_8.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
> index 
> 435d9d59e68d71b1d4c56f1beca5fb1bce4f39b8..9b6c6939eb5c3ae1bdcab7fb854b6c519f054c20
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_8.c
> @@ -12,6 +12,6 @@
>  t_frame_pattern_outgoing (test8, 700, , 8, a[8])
>  t_frame_run (test8)
>  
> -/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 3 } 
> } */
> -/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 3 } } 
> */
> +/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } 
> */
> +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } 
> */
> 

Reply via email to