On Thu, Apr 10, 2025 at 10:10 PM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> PR119610 is about incorrect CFI output for a stack probe when that
> probe is not the initial allocation.  The main aarch64 stack probe
> function, aarch64_allocate_and_probe_stack_space, implicitly assumed
> that the incoming stack pointer pointed to the top of the frame,
> and thus held the CFA.
>
> aarch64_save_callee_saves and aarch64_restore_callee_saves use a
> parameter called bytes_below_sp to track how far the stack pointer
> is above the base of the static frame.  This patch does the same
> thing for aarch64_allocate_and_probe_stack_space.
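
To make the offset arithmetic concrete for anyone else reviewing this: the
sketch below is only an illustration of the calculation the patch introduces,
using plain int64_t values in place of the poly_int64 quantities
(frame.frame_size, bytes_below_sp, poly_size) in aarch64.cc; it is not code
from the patch.

  #include <cassert>
  #include <cstdint>

  /* Plain-integer stand-ins for the poly_int64 values used in
     aarch64_allocate_and_probe_stack_space.  */
  static int64_t
  initial_cfa_offset (int64_t frame_size, int64_t bytes_below_sp)
  {
    /* Distance from the current SP up to the CFA (the top of the static
       frame).  Zero only when SP still points at the top of the frame.  */
    return frame_size - bytes_below_sp;
  }

  static int64_t
  final_cfa_offset (int64_t frame_size, int64_t bytes_below_sp,
                    int64_t poly_size)
  {
    /* After dropping SP by POLY_SIZE, the CFA is POLY_SIZE further away.  */
    return initial_cfa_offset (frame_size, bytes_below_sp) + poly_size;
  }

  int
  main ()
  {
    /* Initial allocation: SP still holds the CFA, so the old assumption
       (offset 0) happens to be right.  */
    assert (initial_cfa_offset (4096, 4096) == 0);
    /* A later allocation, after 1024 bytes have already been allocated:
       the CFA is now 1024 bytes above the current SP, not 0.  */
    assert (initial_cfa_offset (4096, 3072) == 1024);
    assert (final_cfa_offset (4096, 3072, 2048) == 3072);
    return 0;
  }

In other words, the initial allocation is the one case where the old
assumption (a CFA offset of 0 from the incoming SP) was correct; for any
later allocation the notes need the frame_size - bytes_below_sp correction.
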
>
> Also, I noticed that the SVE path was attaching the first CFA note
> to the wrong instruction: it was attaching the note to the calculation
> of the stack size, rather than to the r11<-sp copy.
>
> Bootstrapped & regression-tested on aarch64-linux-gnu.  I'll push on
> Monday if there are no comments before then.  I'd appreciate a second
> pair of eyes though, since this is a sensitive area.

Do you happen to know if the backports to older branches you provided for
the change that triggered this issue (in particular to GCC 7) are also affected?
The testcase from the PR does not fail in the reported way for me, but I guess
that very much depends on "details".

Thanks,
Richard.

> Richard
>
> gcc/
>         PR target/119610
>         * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space):
>         Add a bytes_below_sp parameter and use it to calculate the CFA
>         offsets.  Attach the first SVE CFA note to the move into the
>         associated temporary register.
>         (aarch64_expand_prologue): Update calls accordingly.  Start out
>         with bytes_below_sp set to the frame size and decrement it after
>         each allocation.
>
> gcc/testsuite/
>         PR target/119610
>         * g++.dg/torture/pr119610.C: New test.
>         * g++.target/aarch64/sve/pr119610-sve.C: Likewise.
> ---
>  gcc/config/aarch64/aarch64.cc                 | 66 +++++++++++--------
>  gcc/testsuite/g++.dg/torture/pr119610.C       | 18 +++++
>  .../g++.target/aarch64/sve/pr119610-sve.C     | 20 ++++++
>  3 files changed, 78 insertions(+), 26 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/torture/pr119610.C
>  create mode 100644 gcc/testsuite/g++.target/aarch64/sve/pr119610-sve.C
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 4e801146c60..1e415c1e78f 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -9417,13 +9417,16 @@ aarch64_emit_stack_tie (rtx reg)
>  }
>
>  /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
> -   registers.  If POLY_SIZE is not large enough to require a probe this function
> -   will only adjust the stack.  When allocating the stack space
> -   FRAME_RELATED_P is then used to indicate if the allocation is frame related.
> -   FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
> -   the saved registers.  If we are then we ensure that any allocation
> -   larger than the ABI defined buffer needs a probe so that the
> -   invariant of having a 1KB buffer is maintained.
> +   registers, given that the stack pointer is currently BYTES_BELOW_SP bytes
> +   above the bottom of the static frame.
> +
> +   If POLY_SIZE is not large enough to require a probe this function will only
> +   adjust the stack.  When allocating the stack space FRAME_RELATED_P is then
> +   used to indicate if the allocation is frame related.  FINAL_ADJUSTMENT_P
> +   indicates whether we are allocating the area below the saved registers.
> +   If we are then we ensure that any allocation larger than the ABI defined
> +   buffer needs a probe so that the invariant of having a 1KB buffer is
> +   maintained.
>
>     We emit barriers after each stack adjustment to prevent optimizations from
>     breaking the invariant that we never drop the stack more than a page.  This
> @@ -9440,6 +9443,7 @@ aarch64_emit_stack_tie (rtx reg)
>  static void
>  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>                                         poly_int64 poly_size,
> +                                       poly_int64 bytes_below_sp,
>                                         aarch64_isa_mode force_isa_mode,
>                                         bool frame_related_p,
>                                         bool final_adjustment_p)
> @@ -9503,8 +9507,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>                           poly_size, temp1, temp2, force_isa_mode,
>                           false, true);
>
> -      rtx_insn *insn = get_last_insn ();
> -
> +      auto initial_cfa_offset = frame.frame_size - bytes_below_sp;
> +      auto final_cfa_offset = initial_cfa_offset + poly_size;
>        if (frame_related_p)
>         {
>           /* This is done to provide unwinding information for the stack
> @@ -9514,28 +9518,31 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>              The tie will expand to nothing but the optimizers will not touch
>              the instruction.  */
> -         rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
> -         emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
> +         auto *insn = emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
>           aarch64_emit_stack_tie (stack_ptr_copy);
>
>           /* We want the CFA independent of the stack pointer for the
>              duration of the loop.  */
> -         add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
> +         add_reg_note (insn, REG_CFA_DEF_CFA,
> +                       plus_constant (Pmode, stack_ptr_copy,
> +                                      initial_cfa_offset));
>           RTX_FRAME_RELATED_P (insn) = 1;
>         }
>
>        rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
>        rtx guard_const = gen_int_mode (guard_size, Pmode);
>
> -      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
> -                                                  stack_pointer_rtx, temp1,
> -                                                  probe_const, guard_const));
> +      auto *insn
> +       = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
> +                                               stack_pointer_rtx, temp1,
> +                                               probe_const, guard_const));
>
>        /* Now reset the CFA register if needed.  */
>        if (frame_related_p)
>         {
>           add_reg_note (insn, REG_CFA_DEF_CFA,
> -                       gen_rtx_PLUS (Pmode, stack_pointer_rtx,
> -                                     gen_int_mode (poly_size, Pmode)));
> +                       plus_constant (Pmode, stack_pointer_rtx,
> +                                      final_cfa_offset));
>           RTX_FRAME_RELATED_P (insn) = 1;
>         }
>
> @@ -9581,12 +9588,13 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>          We can determine which allocation we are doing by looking at
>          the value of FRAME_RELATED_P since the final allocations are not
>          frame related.  */
> +      auto cfa_offset = frame.frame_size - (bytes_below_sp - rounded_size);
>        if (frame_related_p)
>         {
>           /* We want the CFA independent of the stack pointer for the
>              duration of the loop.  */
>           add_reg_note (insn, REG_CFA_DEF_CFA,
> -                       plus_constant (Pmode, temp1, rounded_size));
> +                       plus_constant (Pmode, temp1, cfa_offset));
>           RTX_FRAME_RELATED_P (insn) = 1;
>         }
>
> @@ -9608,7 +9616,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
>        if (frame_related_p)
>         {
>           add_reg_note (insn, REG_CFA_DEF_CFA,
> -                       plus_constant (Pmode, stack_pointer_rtx, rounded_size));
> +                       plus_constant (Pmode, stack_pointer_rtx, cfa_offset));
>           RTX_FRAME_RELATED_P (insn) = 1;
>         }
>
> @@ -9916,17 +9924,22 @@ aarch64_expand_prologue (void)
>       code below does not handle it for -fstack-clash-protection.  */
>    gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
>
> +  /* The offset of the current SP from the bottom of the static frame.  */
> +  poly_int64 bytes_below_sp = frame_size;
> +
>    /* Will only probe if the initial adjustment is larger than the guard
>       less the amount of the guard reserved for use by the caller's
>       outgoing args.  */
>    aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
> -                                         force_isa_mode, true, false);
> +                                         bytes_below_sp, force_isa_mode,
> +                                         true, false);
> +  bytes_below_sp -= initial_adjust;
>
>    if (callee_adjust != 0)
> -    aarch64_push_regs (reg1, reg2, callee_adjust);
> -
> -  /* The offset of the current SP from the bottom of the static frame.  */
> -  poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
> +    {
> +      aarch64_push_regs (reg1, reg2, callee_adjust);
> +      bytes_below_sp -= callee_adjust;
> +    }
>
>    if (emit_frame_chain)
>      {
> @@ -9994,7 +10007,7 @@ aarch64_expand_prologue (void)
>                   || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
>        aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
>                                               sve_callee_adjust,
> -                                             force_isa_mode,
> +                                             bytes_below_sp, force_isa_mode,
>                                               !frame_pointer_needed, false);
>        bytes_below_sp -= sve_callee_adjust;
>      }
> @@ -10005,10 +10018,11 @@ aarch64_expand_prologue (void)
>
>    /* We may need to probe the final adjustment if it is larger than the guard
>       that is assumed by the called.  */
> -  gcc_assert (known_eq (bytes_below_sp, final_adjust));
>    aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
> -                                         force_isa_mode,
> +                                         bytes_below_sp, force_isa_mode,
>                                           !frame_pointer_needed, true);
> +  bytes_below_sp -= final_adjust;
> +  gcc_assert (known_eq (bytes_below_sp, 0));
>    if (emit_frame_chain && maybe_ne (final_adjust, 0))
>      aarch64_emit_stack_tie (hard_frame_pointer_rtx);
>
> diff --git a/gcc/testsuite/g++.dg/torture/pr119610.C b/gcc/testsuite/g++.dg/torture/pr119610.C
> new file mode 100644
> index 00000000000..9998026c481
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/torture/pr119610.C
> @@ -0,0 +1,18 @@
> +// { dg-do run }
> +// { dg-additional-options "-fstack-protector-strong" { target 
> fstack_protector } }
> +// { dg-additional-options "-fstack-clash-protection" { target 
> supports_stack_clash_protection } }
> +
> +int *ptr;
> +void foo() {
> +  int c[1024*128];
> +  ptr = c;
> +  throw 1;
> +}
> +int main()
> +{
> +  try {
> +    foo();
> +  } catch(int x) {
> +    return 0;
> +  }
> +}
> diff --git a/gcc/testsuite/g++.target/aarch64/sve/pr119610-sve.C b/gcc/testsuite/g++.target/aarch64/sve/pr119610-sve.C
> new file mode 100644
> index 00000000000..0044e512175
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/aarch64/sve/pr119610-sve.C
> @@ -0,0 +1,20 @@
> +// { dg-do run { target aarch64_sve_hw } }
> +// { dg-additional-options "-fstack-protector-strong" { target 
> fstack_protector } }
> +// { dg-additional-options "-fstack-clash-protection" { target 
> supports_stack_clash_protection } }
> +
> +void *a_ptr, *b_ptr;
> +void foo() {
> +  __SVInt32_t a;
> +  int b[1024*128];
> +  a_ptr = &a;
> +  b_ptr = b;
> +  throw 1;
> +}
> +int main()
> +{
> +  try {
> +    foo();
> +  } catch(int x) {
> +    return 0;
> +  }
> +}
> --
> 2.43.0
>
