On 29/07/16 12:49, Wilco Dijkstra wrote: > This patch optimizes the prolog and epilog code to reduce the number of > instructions and avoid multiple writes to SP. The key idea is that epilogs > are almost exact reverses of prologs, and thus all the decisions only need > to be taken once. The frame layout is decided in aarch64_layout_frame() > and decisions recorded in the new aarch64_frame fields initial_adjust, > callee_adjust, callee_offset and final_adjust. > > A generic frame setup consists of 5 basic steps: > > 1. sub sp, sp, initial_adjust > 2. stp reg1, reg2, [sp, -callee_adjust]! (push if callee_adjust != 0) > 3. add fp, sp, callee_offset (if frame_pointer_needed) > 4. stp reg3, reg4, [sp, callee_offset + N*16] (store remaining callee-saves) > 5. sub sp, sp, final_adjust > > The epilog reverses this, and may omit step 3 if alloca wasn't used. > > Bootstrap, GCC & gdb regression OK. > > ChangeLog: > 2016-07-29 Wilco Dijkstra <wdijk...@arm.com> > > gcc/ > * config/aarch64/aarch64.h (aarch64_frame): > Remove padding0 and hardfp_offset. Add locals_offset, > initial_adjust, callee_adjust, callee_offset and final_adjust. > * config/aarch64/aarch64.c (aarch64_layout_frame): > Remove unused padding0 and hardfp_offset initializations. > Choose frame layout and set frame variables accordingly. > Use INVALID_REGNUM instead of FIRST_PSEUDO_REGISTER. > (aarch64_push_regs): Use INVALID_REGNUM, not FIRST_PSEUDO_REGISTER. > (aarch64_pop_regs): Likewise. > (aarch64_expand_prologue): Remove all decision code, just emit > prolog according to frame variables. > (aarch64_expand_epilogue): Remove all decision code, just emit > epilog according to frame variables. > (aarch64_initial_elimination_offset): Use offset to local/arg area. > > testsuite/ > * gcc.target/aarch64/test_frame_10.c: Fix test to check for a > single stack adjustment, no writeback. > * gcc.target/aarch64/test_frame_12.c: Likewise. > * gcc.target/aarch64/test_frame_13.c: Likewise. 
> * gcc.target/aarch64/test_frame_15.c: Likewise. > * gcc.target/aarch64/test_frame_6.c: Likewise. > * gcc.target/aarch64/test_frame_7.c: Likewise. > * gcc.target/aarch64/test_frame_8.c: Likewise. > * gcc.target/aarch64/test_frame_16.c: New test.
Two minor nits, but otherwise OK. R. > --- > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > 58959229c004e58405076b0e691b6b5634720140..455869f074dd72a38b6f8e1b199d83aa75b408b1 > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -550,11 +550,14 @@ struct GTY (()) aarch64_frame > STACK_BOUNDARY. */ > HOST_WIDE_INT saved_varargs_size; > > + /* The size of the saved callee-save int/FP registers. */ > + > HOST_WIDE_INT saved_regs_size; > - /* Padding if needed after the all the callee save registers have > - been saved. */ > - HOST_WIDE_INT padding0; > - HOST_WIDE_INT hardfp_offset; /* HARD_FRAME_POINTER_REGNUM */ > + > + /* Offset from the base of the frame (incomming SP) to the > + top of the locals area. This value is always a multiple of > + STACK_BOUNDARY. */ > + HOST_WIDE_INT locals_offset; > > /* Offset from the base of the frame (incomming SP) to the > hard_frame_pointer. This value is always a multiple of > @@ -564,12 +567,25 @@ struct GTY (()) aarch64_frame > /* The size of the frame. This value is the offset from base of the > * frame (incomming SP) to the stack_pointer. This value is always > * a multiple of STACK_BOUNDARY. */ > + HOST_WIDE_INT frame_size; > + > + /* The size of the initial stack adjustment before saving callee-saves. */ > + HOST_WIDE_INT initial_adjust; > + > + /* The writeback value when pushing callee-save registers. > + It is zero when no push is used. */ > + HOST_WIDE_INT callee_adjust; > + > + /* The offset from SP to the callee-save registers after initial_adjust. > + It may be non-zero if no push is used (ie. callee_adjust == 0). */ > + HOST_WIDE_INT callee_offset; > + > + /* The size of the stack adjustment after saving callee-saves. 
*/ > + HOST_WIDE_INT final_adjust; > > unsigned wb_candidate1; > unsigned wb_candidate2; > > - HOST_WIDE_INT frame_size; > - > bool laid_out; > }; > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > 2023cb45fc8e87d94b48ae894bea78235056d4a4..7179dac29736409e1679e4bc932b95ba4c9aa1a5 > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -2728,8 +2728,8 @@ aarch64_layout_frame (void) > #define SLOT_NOT_REQUIRED (-2) > #define SLOT_REQUIRED (-1) > > - cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER; > - cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER; > + cfun->machine->frame.wb_candidate1 = INVALID_REGNUM; > + cfun->machine->frame.wb_candidate2 = INVALID_REGNUM; > > /* First mark all the registers that really need to be saved... */ > for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) > @@ -2763,7 +2763,6 @@ aarch64_layout_frame (void) > cfun->machine->frame.wb_candidate1 = R29_REGNUM; > cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD; > cfun->machine->frame.wb_candidate2 = R30_REGNUM; > - cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD; > offset += 2 * UNITS_PER_WORD; > } > > @@ -2772,9 +2771,9 @@ aarch64_layout_frame (void) > if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) > { > cfun->machine->frame.reg_offset[regno] = offset; > - if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER) > + if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) > cfun->machine->frame.wb_candidate1 = regno; > - else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER) > + else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM) > cfun->machine->frame.wb_candidate2 = regno; > offset += UNITS_PER_WORD; > } > @@ -2783,24 +2782,23 @@ aarch64_layout_frame (void) > if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) > { > cfun->machine->frame.reg_offset[regno] = offset; > - if (cfun->machine->frame.wb_candidate1 == 
FIRST_PSEUDO_REGISTER) > + if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) > cfun->machine->frame.wb_candidate1 = regno; > - else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER > + else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM > && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) > cfun->machine->frame.wb_candidate2 = regno; > offset += UNITS_PER_WORD; > } > > - cfun->machine->frame.padding0 = > - (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset); > offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); > > cfun->machine->frame.saved_regs_size = offset; > > + HOST_WIDE_INT varargs_and_saved_regs_size = offset > + + cfun->machine->frame.saved_varargs_size; This should be written either as HOST_WIDE_INT varargs_and_saved_regs_size = offset + cfun->machine->frame.saved_varargs_size; or as HOST_WIDE_INT varargs_and_saved_regs_size = (offset + cfun->machine->frame.saved_varargs_size); Which form you use may depend on the overall line length. In this case, I think the former is preferable. 
> + > cfun->machine->frame.hard_fp_offset > - = ROUND_UP (cfun->machine->frame.saved_varargs_size > - + get_frame_size () > - + cfun->machine->frame.saved_regs_size, > + = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (), > STACK_BOUNDARY / BITS_PER_UNIT); > > cfun->machine->frame.frame_size > @@ -2808,6 +2806,77 @@ aarch64_layout_frame (void) > + crtl->outgoing_args_size, > STACK_BOUNDARY / BITS_PER_UNIT); > > + cfun->machine->frame.locals_offset = > cfun->machine->frame.saved_varargs_size; > + > + cfun->machine->frame.initial_adjust = 0; > + cfun->machine->frame.final_adjust = 0; > + cfun->machine->frame.callee_adjust = 0; > + cfun->machine->frame.callee_offset = 0; > + > + HOST_WIDE_INT max_push_offset = 0; > + if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM) > + max_push_offset = 512; > + else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM) > + max_push_offset = 256; > + > + if (cfun->machine->frame.frame_size < max_push_offset > + && crtl->outgoing_args_size == 0) > + { > + /* Simple, small frame with no outgoing arguments: > + stp reg1, reg2, [sp, -frame_size]! > + stp reg3, reg4, [sp, 16] */ > + cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size; > + } > + else if (crtl->outgoing_args_size > + + cfun->machine->frame.saved_regs_size < 512 This sub-expression should be wrapped in parentheses, so that the '+' indents more deeply than the && that follows. Otherwise the logic can be slightly confusing. 
> + && !(cfun->calls_alloca > + && cfun->machine->frame.hard_fp_offset < max_push_offset)) > + { > + /* Frame with small outgoing arguments: > + sub sp, sp, frame_size > + stp reg1, reg2, [sp, outgoing_args_size] > + stp reg3, reg4, [sp, outgoing_args_size + 16] */ > + cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size; > + cfun->machine->frame.callee_offset > + = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset; > + } > + else if (cfun->machine->frame.hard_fp_offset < max_push_offset) > + { > + /* Frame with large outgoing arguments but a small local area: > + stp reg1, reg2, [sp, -hard_fp_offset]! > + stp reg3, reg4, [sp, 16] > + sub sp, sp, outgoing_args_size */ > + cfun->machine->frame.callee_adjust = > cfun->machine->frame.hard_fp_offset; > + cfun->machine->frame.final_adjust > + = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust; > + } > + else if (!frame_pointer_needed > + && varargs_and_saved_regs_size < max_push_offset) > + { > + /* Frame with large local area and outgoing arguments (this pushes the > + callee-saves first, followed by the locals and outgoing area): > + stp reg1, reg2, [sp, -varargs_and_saved_regs_size]! 
> + stp reg3, reg4, [sp, 16] > + sub sp, sp, frame_size - varargs_and_saved_regs_size */ > + cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size; > + cfun->machine->frame.final_adjust > + = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust; > + cfun->machine->frame.hard_fp_offset = > cfun->machine->frame.callee_adjust; > + cfun->machine->frame.locals_offset = > cfun->machine->frame.hard_fp_offset; > + } > + else > + { > + /* Frame with large local area and outgoing arguments using frame > pointer: > + sub sp, sp, hard_fp_offset > + stp x29, x30, [sp, 0] > + add x29, sp, 0 > + stp reg3, reg4, [sp, 16] > + sub sp, sp, outgoing_args_size */ > + cfun->machine->frame.initial_adjust = > cfun->machine->frame.hard_fp_offset; > + cfun->machine->frame.final_adjust > + = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust; > + } > + > cfun->machine->frame.laid_out = true; > } > > @@ -2866,7 +2935,7 @@ aarch64_push_regs (unsigned regno1, unsigned regno2, > HOST_WIDE_INT adjustment) > rtx_insn *insn; > machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode; > > - if (regno2 == FIRST_PSEUDO_REGISTER) > + if (regno2 == INVALID_REGNUM) > return aarch64_pushwb_single_reg (mode, regno1, adjustment); > > rtx reg1 = gen_rtx_REG (mode, regno1); > @@ -2905,7 +2974,7 @@ aarch64_pop_regs (unsigned regno1, unsigned regno2, > HOST_WIDE_INT adjustment, > > *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); > > - if (regno2 == FIRST_PSEUDO_REGISTER) > + if (regno2 == INVALID_REGNUM) > { > rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment); > mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem); > @@ -3106,23 +3175,16 @@ aarch64_restore_callee_saves (machine_mode mode, > void > aarch64_expand_prologue (void) > { > - /* sub sp, sp, #<frame_size> > - stp {fp, lr}, [sp, #<frame_size> - 16] > - add fp, sp, #<frame_size> - hardfp_offset > - stp {cs_reg}, [fp, #-16] etc. 
> - > - sub sp, sp, <final_adjustment_if_any> > - */ > - HOST_WIDE_INT frame_size, offset; > - HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */ > - HOST_WIDE_INT hard_fp_offset; > - rtx_insn *insn; > - > aarch64_layout_frame (); > > - offset = frame_size = cfun->machine->frame.frame_size; > - hard_fp_offset = cfun->machine->frame.hard_fp_offset; > - fp_offset = frame_size - hard_fp_offset; > + HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size; > + HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust; > + HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; > + HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust; > + HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset; > + unsigned reg1 = cfun->machine->frame.wb_candidate1; > + unsigned reg2 = cfun->machine->frame.wb_candidate2; > + rtx_insn *insn; > > if (flag_stack_usage_info) > current_function_static_stack_size = frame_size; > @@ -3139,94 +3201,29 @@ aarch64_expand_prologue (void) > aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size); > } > > - /* Store pairs and load pairs have a range only -512 to 504. */ > - if (offset >= 512) > - { > - /* When the frame has a large size, an initial decrease is done on > - the stack pointer to jump over the callee-allocated save area for > - register varargs, the local variable area and/or the callee-saved > - register area. This will allow the pre-index write-back > - store pair instructions to be used for setting up the stack frame > - efficiently. 
*/ > - offset = hard_fp_offset; > - if (offset >= 512) > - offset = cfun->machine->frame.saved_regs_size; > - > - frame_size -= (offset + crtl->outgoing_args_size); > - fp_offset = 0; > + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true); > > - aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -frame_size, true); > - } > - else > - frame_size = -1; > + if (callee_adjust != 0) > + aarch64_push_regs (reg1, reg2, callee_adjust); > > - if (offset > 0) > + if (frame_pointer_needed) > { > - bool skip_wb = false; > - > - if (frame_pointer_needed) > - { > - skip_wb = true; > - > - if (fp_offset) > - { > - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, > - GEN_INT (-offset))); > - RTX_FRAME_RELATED_P (insn) = 1; > - > - aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM, > - R30_REGNUM, false); > - } > - else > - aarch64_push_regs (R29_REGNUM, R30_REGNUM, offset); > - > - /* Set up frame pointer to point to the location of the > - previous frame pointer on the stack. 
*/ > - insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx, > - stack_pointer_rtx, > - GEN_INT (fp_offset))); > - RTX_FRAME_RELATED_P (insn) = 1; > - emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); > - } > - else > - { > - unsigned reg1 = cfun->machine->frame.wb_candidate1; > - unsigned reg2 = cfun->machine->frame.wb_candidate2; > - > - if (fp_offset > - || reg1 == FIRST_PSEUDO_REGISTER > - || (reg2 == FIRST_PSEUDO_REGISTER > - && offset >= 256)) > - { > - insn = emit_insn (gen_add2_insn (stack_pointer_rtx, > - GEN_INT (-offset))); > - RTX_FRAME_RELATED_P (insn) = 1; > - } > - else > - { > - aarch64_push_regs (reg1, reg2, offset); > - skip_wb = true; > - } > - } > - > - aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM, > - skip_wb); > - aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM, > - skip_wb); > + if (callee_adjust == 0) > + aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM, > + R30_REGNUM, false); > + insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx, > + stack_pointer_rtx, > + GEN_INT (callee_offset))); > + RTX_FRAME_RELATED_P (insn) = 1; > + emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); > } > > - /* when offset >= 512, > - sub sp, sp, #<outgoing_args_size> */ > - if (frame_size > -1) > - { > - if (crtl->outgoing_args_size > 0) > - { > - insn = emit_insn (gen_add2_insn > - (stack_pointer_rtx, > - GEN_INT (- crtl->outgoing_args_size))); > - RTX_FRAME_RELATED_P (insn) = 1; > - } > - } > + aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, > + callee_adjust != 0 || frame_pointer_needed); > + aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, > + callee_adjust != 0 || frame_pointer_needed); > + aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust, > + !frame_pointer_needed); > } > > /* Return TRUE if we can use a simple_return insn. 
> @@ -3249,104 +3246,80 @@ aarch64_use_return_insn_p (void) > return cfun->machine->frame.frame_size == 0; > } > > -/* Generate the epilogue instructions for returning from a function. */ > +/* Generate the epilogue instructions for returning from a function. > + This is almost exactly the reverse of the prolog sequence, except > + that we need to insert barriers to avoid scheduling loads that read > + from a deallocated stack, and we optimize the unwind records by > + emitting them all together if possible. */ > void > aarch64_expand_epilogue (bool for_sibcall) > { > - HOST_WIDE_INT frame_size, offset; > - HOST_WIDE_INT fp_offset; > - HOST_WIDE_INT hard_fp_offset; > - rtx_insn *insn; > - /* We need to add memory barrier to prevent read from deallocated stack. > */ > - bool need_barrier_p = (get_frame_size () != 0 > - || cfun->machine->frame.saved_varargs_size); > - > aarch64_layout_frame (); > > - offset = frame_size = cfun->machine->frame.frame_size; > - hard_fp_offset = cfun->machine->frame.hard_fp_offset; > - fp_offset = frame_size - hard_fp_offset; > + HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust; > + HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; > + HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust; > + HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset; > + unsigned reg1 = cfun->machine->frame.wb_candidate1; > + unsigned reg2 = cfun->machine->frame.wb_candidate2; > + rtx cfi_ops = NULL; > + rtx_insn *insn; > > - /* Store pairs and load pairs have a range only -512 to 504. */ > - if (offset >= 512) > - { > - offset = hard_fp_offset; > - if (offset >= 512) > - offset = cfun->machine->frame.saved_regs_size; > + /* We need to add memory barrier to prevent read from deallocated stack. 
> */ > + bool need_barrier_p = (get_frame_size () > + + cfun->machine->frame.saved_varargs_size) != 0; > > - frame_size -= (offset + crtl->outgoing_args_size); > - fp_offset = 0; > - if (!frame_pointer_needed && crtl->outgoing_args_size > 0) > - { > - insn = emit_insn (gen_add2_insn > - (stack_pointer_rtx, > - GEN_INT (crtl->outgoing_args_size))); > - RTX_FRAME_RELATED_P (insn) = 1; > - } > + /* Emit a barrier to prevent loads from a deallocated stack. */ > + if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca) > + { > + emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); > + need_barrier_p = false; > } > - else > - frame_size = -1; > > - /* If there were outgoing arguments or we've done dynamic stack > - allocation, then restore the stack pointer from the frame > - pointer. This is at most one insn and more efficient than using > - GCC's internal mechanism. */ > - if (frame_pointer_needed > - && (crtl->outgoing_args_size || cfun->calls_alloca)) > + /* Restore the stack pointer from the frame pointer if it may not > + be the same as the stack pointer. */ > + if (frame_pointer_needed && (final_adjust || cfun->calls_alloca)) > { > - if (cfun->calls_alloca) > - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); > - > insn = emit_insn (gen_add3_insn (stack_pointer_rtx, > hard_frame_pointer_rtx, > - GEN_INT (0))); > - offset = offset - fp_offset; > + GEN_INT (-callee_offset))); > + /* If writeback is used when restoring callee-saves, the CFA > + is restored on the instruction doing the writeback. 
*/ > + RTX_FRAME_RELATED_P (insn) = callee_adjust == 0; > } > + else > + aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true); > > - if (offset > 0) > - { > - unsigned reg1 = cfun->machine->frame.wb_candidate1; > - unsigned reg2 = cfun->machine->frame.wb_candidate2; > - bool skip_wb = true; > - rtx cfi_ops = NULL; > - > - if (frame_pointer_needed) > - fp_offset = 0; > - else if (fp_offset > - || reg1 == FIRST_PSEUDO_REGISTER > - || (reg2 == FIRST_PSEUDO_REGISTER > - && offset >= 256)) > - skip_wb = false; > - > - aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM, > - skip_wb, &cfi_ops); > - aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM, > - skip_wb, &cfi_ops); > + aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, > + callee_adjust != 0, &cfi_ops); > + aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, > + callee_adjust != 0, &cfi_ops); > > - if (need_barrier_p) > - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); > + if (need_barrier_p) > + emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); > > - if (skip_wb) > - aarch64_pop_regs (reg1, reg2, offset, &cfi_ops); > - else > - emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (offset))); > + if (callee_adjust != 0) > + aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops); > > - /* Reset the CFA to be SP + FRAME_SIZE. */ > - rtx new_cfa = stack_pointer_rtx; > - if (frame_size > 0) > - new_cfa = plus_constant (Pmode, new_cfa, frame_size); > - cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops); > + if (callee_adjust != 0 || initial_adjust > 65536) > + { > + /* Emit delayed restores and set the CFA to be SP + initial_adjust. 
*/ > insn = get_last_insn (); > - REG_NOTES (insn) = cfi_ops; > + rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust); > + REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops); > RTX_FRAME_RELATED_P (insn) = 1; > + cfi_ops = NULL; > } > > - if (frame_size > 0) > - { > - if (need_barrier_p) > - emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); > + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true); > > - aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, frame_size, true); > + if (cfi_ops) > + { > + /* Emit delayed restores and reset the CFA to be SP. */ > + insn = get_last_insn (); > + cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops); > + REG_NOTES (insn) = cfi_ops; > + RTX_FRAME_RELATED_P (insn) = 1; > } > > /* Stack adjustment for exception handler. */ > @@ -5173,18 +5146,18 @@ aarch64_initial_elimination_offset (unsigned from, > unsigned to) > if (to == HARD_FRAME_POINTER_REGNUM) > { > if (from == ARG_POINTER_REGNUM) > - return cfun->machine->frame.frame_size - crtl->outgoing_args_size; > + return cfun->machine->frame.hard_fp_offset; > > if (from == FRAME_POINTER_REGNUM) > - return (cfun->machine->frame.hard_fp_offset > - - cfun->machine->frame.saved_varargs_size); > + return cfun->machine->frame.hard_fp_offset > + - cfun->machine->frame.locals_offset; > } > > if (to == STACK_POINTER_REGNUM) > { > if (from == FRAME_POINTER_REGNUM) > - return (cfun->machine->frame.frame_size > - - cfun->machine->frame.saved_varargs_size); > + return cfun->machine->frame.frame_size > + - cfun->machine->frame.locals_offset; > } > > return cfun->machine->frame.frame_size; > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_10.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_10.c > index > 70dd6539af93a034ae64f8603089c6d6f59a6b53..e23a4a83528b71a0de0c95752a9e530bf4ca79e5 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_10.c > +++ 
b/gcc/testsuite/gcc.target/aarch64/test_frame_10.c > @@ -4,8 +4,7 @@ > * total frame size > 512. > area except outgoing <= 512 > * number of callee-saved reg >= 2. > - * Split stack adjustment into two subtractions. > - the first subtractions could be optimized into "stp !". */ > + * Use a single stack adjustment, no writeback. */ > > /* { dg-do run } */ > /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */ > @@ -15,6 +14,6 @@ > t_frame_pattern_outgoing (test10, 480, "x19", 24, a[8], a[9], a[10]) > t_frame_run (test10) > > -/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" > 1 } } */ > -/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 > } } */ > +/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 > } } */ > +/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp, \[0-9\]+\\\]" 1 > } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c > index > 2353477c29ea99c56e73a34cf0449cf6c669e973..3d7d3594610c645d2d6f449b6ee0400fdd395849 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_12.c > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_12.c > @@ -13,6 +13,6 @@ t_frame_run (test12) > > /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */ > > -/* Check epilogue using write-back. */ > -/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp\\\], \[0-9\]+" 3 > } } */ > +/* Check epilogue using no write-back. */ > +/* { dg-final { scan-assembler-times "ldp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 > } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_13.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_13.c > index > f3aa263929421db12b78abc733e2b011db3a4e48..74b3370fa463b652265e00fff80cc8856524d509 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_13.c > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_13.c > @@ -2,8 +2,7 @@ > * without outgoing. 
> * total frame size > 512. > * number of callee-save reg >= 2. > - * split the stack adjustment into two substractions, > - the second could be optimized into "stp !". */ > + * Use a single stack adjustment, no writeback. */ > > /* { dg-do run } */ > /* { dg-options "-O2 --save-temps" } */ > @@ -14,4 +13,4 @@ t_frame_pattern (test13, 700, ) > t_frame_run (test13) > > /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */ > -/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" > 2 } } */ > +/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp\\\]" 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_15.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_15.c > index > fc6f713232de52b72ba5c3eef92e1aea6526199d..bed6714b4fe529a3b81ad8c5253924aa97bf8806 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_15.c > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_15.c > @@ -3,8 +3,7 @@ > * total frame size > 512. > area except outgoing <= 512 > * number of callee-save reg >= 2. > - * split the stack adjustment into two substractions, > - the first could be optimized into "stp !". */ > + * Use a single stack adjustment, no writeback. */ > > /* { dg-do run } */ > /* { dg-options "-O2 --save-temps" } */ > @@ -15,4 +14,4 @@ t_frame_pattern_outgoing (test15, 480, , 8, a[8]) > t_frame_run (test15) > > /* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 1 } } */ > -/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, -\[0-9\]+\\\]!" > 3 } } */ > +/* { dg-final { scan-assembler-times "stp\tx29, x30, \\\[sp, \[0-9\]+\\\]" 1 > } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..28f3826adadd5eaa6486659e4d6b6d7c5960b9d2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > @@ -0,0 +1,25 @@ > +/* Verify: > + * with outgoing. 
> + * single int register push. > + * varargs and callee-save size >= 256 > + * Use 2 stack adjustments. */ > + > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */ > + > +#define REP8(X) X,X,X,X,X,X,X,X > +#define REP64(X) REP8(REP8(X)) > + > +void outgoing (__builtin_va_list, ...); > + > +double vararg_outgoing (int x1, ...) > +{ > + double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = > x1 * 6; > + __builtin_va_list vl; > + __builtin_va_start (vl, x1); > + outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1)); > + __builtin_va_end (vl); > + return a1 + a2 + a3 + a4 + a5 + a6; > +} > + > +/* { dg-final { scan-assembler-times "sub\tsp, sp, #\[0-9\]+" 2 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_6.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_6.c > index > d8481346c58458934deecb4b7f38fb5821517b56..6a753dff87e28fa71a2f69df5fb95559163fa6cd > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_6.c > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_6.c > @@ -3,8 +3,7 @@ > * without outgoing. > * total frame size > 512. > * number of callee-saved reg == 1. > - * split stack adjustment into two subtractions. > - the second subtraction should use "str !". */ > + * use a single stack adjustment, no writeback. */ > > /* { dg-do run } */ > /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */ > @@ -14,6 +13,7 @@ > t_frame_pattern (test6, 700, ) > t_frame_run (test6) > > -/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 
2 } > } */ > -/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 2 } } > */ > +/* { dg-final { scan-assembler-times "str\tx30, \\\[sp\\\]" 1 } } */ > +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]" 2 } } */ > +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\]," 1 } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_7.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_7.c > index > d87d68b3eec72dd23b279ea94391a400c9ae5a9a..f2a8713d19d9f7df49073e9588c5d74661491fb6 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_7.c > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_7.c > @@ -3,8 +3,7 @@ > * without outgoing. > * total frame size > 512. > * number of callee-saved reg == 2. > - * split stack adjustment into two subtractions. > - the second subtraction should use "stp !". */ > + * use a single stack adjustment, no writeback. */ > > /* { dg-do run } */ > /* { dg-options "-O2 -fomit-frame-pointer --save-temps" } */ > @@ -14,6 +13,6 @@ > t_frame_pattern (test7, 700, "x19") > t_frame_run (test7) > > -/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp, -\[0-9\]+\\\]!" > 1 } } */ > -/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\], \[0-9\]+" 1 > } } */ > +/* { dg-final { scan-assembler-times "stp\tx19, x30, \\\[sp]" 1 } } */ > +/* { dg-final { scan-assembler-times "ldp\tx19, x30, \\\[sp\\\]" 1 } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_8.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_8.c > index > 435d9d59e68d71b1d4c56f1beca5fb1bce4f39b8..9b6c6939eb5c3ae1bdcab7fb854b6c519f054c20 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_8.c > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_8.c > @@ -12,6 +12,6 @@ > t_frame_pattern_outgoing (test8, 700, , 8, a[8]) > t_frame_run (test8) > > -/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, -\[0-9\]+\\\]!" 
3 } > } */ > -/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp\\\], \[0-9\]+" 3 } } > */ > +/* { dg-final { scan-assembler-times "str\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } > */ > +/* { dg-final { scan-assembler-times "ldr\tx30, \\\[sp, \[0-9\]+\\\]" 1 } } > */ >