Hi,
the stack clash protection mechanism in the x86 back-end was implemented by
largely duplicating the existing stack checking implementation. Now the only
significant difference between them is the probing window, which is shifted by
1 probing interval (not 2 as documented in explow.c), but we can certainly do
1 more probe for stack checking even if it is redundant in almost all cases.
Tested on x86-64/Linux, OK for the mainline?
2020-07-15 Eric Botcazou <ebotca...@adacore.com>
* config/i386/i386.c (ix86_compute_frame_layout): Minor tweak.
(ix86_adjust_stack_and_probe): Delete.
(ix86_adjust_stack_and_probe_stack_clash): Rename to above and add
PROTECTION_AREA parameter. If it is true, probe PROBE_INTERVAL plus
a small dope beyond SIZE bytes.
(ix86_emit_probe_stack_range): Use local variable.
(ix86_expand_prologue): Adjust calls to ix86_adjust_stack_and_probe
and tidy up the stack checking code.
* explow.c (get_stack_check_protect): Fix head comment.
(anti_adjust_stack_and_probe_stack_clash): Likewise.
(allocate_dynamic_stack_space): Add comment.
--
Eric Botcazou
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5c373c091ce..31757b044c8 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -6169,10 +6169,7 @@ ix86_compute_frame_layout (void)
}
frame->save_regs_using_mov
- = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
- /* If static stack checking is enabled and done with probes,
- the registers need to be saved before allocating the frame. */
- && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
+ = TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue;
/* Skip return address and error code in exception handler. */
offset = INCOMING_FRAME_SP_OFFSET;
@@ -6329,6 +6326,9 @@ ix86_compute_frame_layout (void)
if ((!to_allocate && frame->nregs <= 1)
|| (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
+ /* If static stack checking is enabled and done with probes,
+ the registers need to be saved before allocating the frame. */
+ || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
/* If stack clash probing needs a loop, then it needs a
scratch register. But the returned register is only guaranteed
to be safe to use after register saves are complete. So if
@@ -7122,17 +7122,20 @@ release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
/* Emit code to adjust the stack pointer by SIZE bytes while probing it.
- This differs from the next routine in that it tries hard to prevent
- attacks that jump the stack guard. Thus it is never allowed to allocate
- more than PROBE_INTERVAL bytes of stack space without a suitable
- probe.
+ If INT_REGISTERS_SAVED is true, then integer registers have already been
+ pushed on the stack.
- INT_REGISTERS_SAVED is true if integer registers have already been
- pushed on the stack. */
+ If PROTECTION AREA is true, then probe PROBE_INTERVAL plus a small dope
+ beyond SIZE bytes.
+
+ This assumes no knowledge of the current probing state, i.e. it is never
+ allowed to allocate more than PROBE_INTERVAL bytes of stack space without
+ a suitable probe. */
static void
-ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
- const bool int_registers_saved)
+ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
+ const bool int_registers_saved,
+ const bool protection_area)
{
struct machine_function *m = cfun->machine;
@@ -7194,10 +7197,17 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
emit_insn (gen_blockage ());
}
+ const HOST_WIDE_INT probe_interval = get_probe_interval ();
+ const int dope = 4 * UNITS_PER_WORD;
+
+ /* If there is protection area, take it into account in the size. */
+ if (protection_area)
+ size += probe_interval + dope;
+
/* If we allocate less than the size of the guard statically,
then no probing is necessary, but we do need to allocate
the stack. */
- if (size < (1 << param_stack_clash_protection_guard_size))
+ else if (size < (1 << param_stack_clash_protection_guard_size))
{
pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
GEN_INT (-size), -1,
@@ -7209,7 +7219,6 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
/* We're allocating a large enough stack frame that we need to
emit probes. Either emit them inline or in a loop depending
on the size. */
- HOST_WIDE_INT probe_interval = get_probe_interval ();
if (size <= 4 * probe_interval)
{
HOST_WIDE_INT i;
@@ -7228,12 +7237,19 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
}
/* We need to allocate space for the residual, but we do not need
- to probe the residual. */
+ to probe the residual... */
HOST_WIDE_INT residual = (i - probe_interval - size);
if (residual)
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (residual), -1,
- m->fs.cfa_reg == stack_pointer_rtx);
+ {
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (residual), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+
+ /* ...except if there is a protection area to maintain. */
+ if (protection_area)
+ emit_stack_probe (stack_pointer_rtx);
+ }
+
dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
}
else
@@ -7296,186 +7312,27 @@ ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
is equal to ROUNDED_SIZE. */
if (size != rounded_size)
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (rounded_size - size), -1,
- m->fs.cfa_reg == stack_pointer_rtx);
- dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
-
- /* This does not deallocate the space reserved for the scratch
- register. That will be deallocated in the epilogue. */
- release_scratch_register_on_entry (&sr, size, false);
- }
-
- /* Make sure nothing is scheduled before we are done. */
- emit_insn (gen_blockage ());
-}
-
-/* Emit code to adjust the stack pointer by SIZE bytes while probing it.
-
- INT_REGISTERS_SAVED is true if integer registers have already been
- pushed on the stack. */
-
-static void
-ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
- const bool int_registers_saved)
-{
- /* We skip the probe for the first interval + a small dope of 4 words and
- probe that many bytes past the specified size to maintain a protection
- area at the botton of the stack. */
- const int dope = 4 * UNITS_PER_WORD;
- rtx size_rtx = GEN_INT (size), last;
-
- /* See if we have a constant small number of probes to generate. If so,
- that's the easy case. The run-time loop is made up of 9 insns in the
- generic case while the compile-time loop is made up of 3+2*(n-1) insns
- for n # of intervals. */
- if (size <= 4 * get_probe_interval ())
- {
- HOST_WIDE_INT i, adjust;
- bool first_probe = true;
-
- /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
- values of N from 1 until it exceeds SIZE. If only one probe is
- needed, this will not generate any code. Then adjust and probe
- to PROBE_INTERVAL + SIZE. */
- for (i = get_probe_interval (); i < size; i += get_probe_interval ())
{
- if (first_probe)
- {
- adjust = 2 * get_probe_interval () + dope;
- first_probe = false;
- }
- else
- adjust = get_probe_interval ();
-
- emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- -adjust)));
- emit_stack_probe (stack_pointer_rtx);
- }
-
- if (first_probe)
- adjust = size + get_probe_interval () + dope;
- else
- adjust = size + get_probe_interval () - i;
-
- emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- -adjust)));
- emit_stack_probe (stack_pointer_rtx);
-
- /* Adjust back to account for the additional first interval. */
- last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- (get_probe_interval ()
- + dope))));
- }
-
- /* Otherwise, do the same as above, but in a loop. Note that we must be
- extra careful with variables wrapping around because we might be at
- the very top (or the very bottom) of the address space and we have
- to be able to handle this case properly; in particular, we use an
- equality test for the loop condition. */
- else
- {
- /* We expect the GP registers to be saved when probes are used
- as the probing sequences might need a scratch register and
- the routine to allocate one assumes the integer registers
- have already been saved. */
- gcc_assert (int_registers_saved);
-
- HOST_WIDE_INT rounded_size;
- struct scratch_reg sr;
-
- get_scratch_register_on_entry (&sr);
-
- /* If we needed to save a register, then account for any space
- that was pushed (we are not going to pop the register when
- we do the restore). */
- if (sr.saved)
- size -= UNITS_PER_WORD;
-
- /* Step 1: round SIZE to the previous multiple of the interval. */
-
- rounded_size = ROUND_DOWN (size, get_probe_interval ());
-
-
- /* Step 2: compute initial and final value of the loop counter. */
-
- /* SP = SP_0 + PROBE_INTERVAL. */
- emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- - (get_probe_interval () + dope))));
-
- /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
- if (rounded_size <= (HOST_WIDE_INT_1 << 31))
- emit_insn (gen_rtx_SET (sr.reg,
- plus_constant (Pmode, stack_pointer_rtx,
- -rounded_size)));
- else
- {
- emit_move_insn (sr.reg, GEN_INT (-rounded_size));
- emit_insn (gen_rtx_SET (sr.reg,
- gen_rtx_PLUS (Pmode, sr.reg,
- stack_pointer_rtx)));
- }
-
-
- /* Step 3: the loop
-
- do
- {
- SP = SP + PROBE_INTERVAL
- probe at SP
- }
- while (SP != LAST_ADDR)
-
- adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
- values of N from 1 until it is equal to ROUNDED_SIZE. */
-
- emit_insn (gen_adjust_stack_and_probe (Pmode, sr.reg, sr.reg, size_rtx));
-
-
- /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
- assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (rounded_size - size), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
- if (size != rounded_size)
- {
- emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- rounded_size - size)));
- emit_stack_probe (stack_pointer_rtx);
+ if (protection_area)
+ emit_stack_probe (stack_pointer_rtx);
}
- /* Adjust back to account for the additional first interval. */
- last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- (get_probe_interval ()
- + dope))));
+ dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
/* This does not deallocate the space reserved for the scratch
register. That will be deallocated in the epilogue. */
release_scratch_register_on_entry (&sr, size, false);
}
- /* Even if the stack pointer isn't the CFA register, we need to correctly
- describe the adjustments made to it, in particular differentiate the
- frame-related ones from the frame-unrelated ones. */
- if (size > 0)
- {
- rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
- XVECEXP (expr, 0, 0)
- = gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx, -size));
- XVECEXP (expr, 0, 1)
- = gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- get_probe_interval () + dope + size));
- add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
- RTX_FRAME_RELATED_P (last) = 1;
-
- cfun->machine->fs.sp_offset += size;
- }
+ /* Adjust back to account for the protection area. */
+ if (protection_area)
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (probe_interval + dope), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
/* Make sure nothing is scheduled before we are done. */
emit_insn (gen_blockage ());
@@ -7527,18 +7384,20 @@ static void
ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
const bool int_registers_saved)
{
+ const HOST_WIDE_INT probe_interval = get_probe_interval ();
+
/* See if we have a constant small number of probes to generate. If so,
that's the easy case. The run-time loop is made up of 6 insns in the
generic case while the compile-time loop is made up of n insns for n #
of intervals. */
- if (size <= 6 * get_probe_interval ())
+ if (size <= 6 * probe_interval)
{
HOST_WIDE_INT i;
/* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
it exceeds SIZE. If only one probe is needed, this will not
generate any code. Then probe at FIRST + SIZE. */
- for (i = get_probe_interval (); i < size; i += get_probe_interval ())
+ for (i = probe_interval; i < size; i += probe_interval)
emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
-(first + i)));
@@ -7567,7 +7426,7 @@ ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
/* Step 1: round SIZE to the previous multiple of the interval. */
- rounded_size = ROUND_DOWN (size, get_probe_interval ());
+ rounded_size = ROUND_DOWN (size, probe_interval);
/* Step 2: compute initial and final value of the loop counter. */
@@ -8324,27 +8183,33 @@ ix86_expand_prologue (void)
sse_registers_saved = true;
}
+ /* If stack clash protection is requested, then probe the stack. */
+ if (allocate >= 0 && flag_stack_clash_protection)
+ {
+ ix86_adjust_stack_and_probe (allocate, int_registers_saved, false);
+ allocate = 0;
+ }
+
/* The stack has already been decremented by the instruction calling us
so probe if the size is non-negative to preserve the protection area. */
- if (allocate >= 0
- && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
- || flag_stack_clash_protection))
+ else if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
{
- if (flag_stack_clash_protection)
- {
- ix86_adjust_stack_and_probe_stack_clash (allocate,
- int_registers_saved);
- allocate = 0;
- }
- else if (STACK_CHECK_MOVING_SP)
+ const HOST_WIDE_INT probe_interval = get_probe_interval ();
+
+ if (STACK_CHECK_MOVING_SP)
{
- if (!(crtl->is_leaf && !cfun->calls_alloca
- && allocate <= get_probe_interval ()))
+ if (crtl->is_leaf
+ && !cfun->calls_alloca
+ && allocate <= probe_interval)
+ ;
+
+ else
{
- ix86_adjust_stack_and_probe (allocate, int_registers_saved);
+ ix86_adjust_stack_and_probe (allocate, int_registers_saved, true);
allocate = 0;
}
}
+
else
{
HOST_WIDE_INT size = allocate;
@@ -8356,7 +8221,7 @@ ix86_expand_prologue (void)
{
if (crtl->is_leaf && !cfun->calls_alloca)
{
- if (size > get_probe_interval ())
+ if (size > probe_interval)
ix86_emit_probe_stack_range (0, size, int_registers_saved);
}
else
@@ -8368,7 +8233,7 @@ ix86_expand_prologue (void)
{
if (crtl->is_leaf && !cfun->calls_alloca)
{
- if (size > get_probe_interval ()
+ if (size > probe_interval
&& size > get_stack_check_protect ())
ix86_emit_probe_stack_range (get_stack_check_protect (),
(size
diff --git a/gcc/explow.c b/gcc/explow.c
index 15c9cfb0318..0fbc6d25b81 100644
--- a/gcc/explow.c
+++ b/gcc/explow.c
@@ -1293,9 +1293,9 @@ get_dynamic_stack_size (rtx *psize, unsigned size_align,
/* Return the number of bytes to "protect" on the stack for -fstack-check.
- "protect" in the context of -fstack-check means how many bytes we
- should always ensure are available on the stack. More importantly
- this is how many bytes are skipped when probing the stack.
+ "protect" in the context of -fstack-check means how many bytes we need
+ to always ensure are available on the stack; as a consequence, this is
+ also how many bytes are first skipped when probing the stack.
On some targets we want to reuse the -fstack-check prologue support
to give a degree of protection against stack clashing style attacks.
@@ -1303,14 +1303,16 @@ get_dynamic_stack_size (rtx *psize, unsigned size_align,
In that scenario we do not want to skip bytes before probing as that
would render the stack clash protections useless.
- So we never use STACK_CHECK_PROTECT directly. Instead we indirect though
- this helper which allows us to provide different values for
- -fstack-check and -fstack-clash-protection. */
+ So we never use STACK_CHECK_PROTECT directly. Instead we indirectly
+ use it through this helper, which allows to provide different values
+ for -fstack-check and -fstack-clash-protection. */
+
HOST_WIDE_INT
get_stack_check_protect (void)
{
if (flag_stack_clash_protection)
return 0;
+
return STACK_CHECK_PROTECT;
}
@@ -1532,6 +1534,8 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
saved_stack_pointer_delta = stack_pointer_delta;
+ /* If stack checking or stack clash protection is requested,
+ then probe the stack while allocating space from it. */
if (flag_stack_check && STACK_CHECK_MOVING_SP)
anti_adjust_stack_and_probe (size, false);
else if (flag_stack_clash_protection)
@@ -1940,8 +1944,8 @@ emit_stack_clash_protection_probe_loop_end (rtx loop_lab, rtx end_loop,
probes were not emitted.
2. It never skips probes, whereas anti_adjust_stack_and_probe will
- skip probes on the first couple PROBE_INTERVALs on the assumption
- they're done elsewhere.
+ skip the probe on the first PROBE_INTERVAL on the assumption it
+ was already done in the prologue and in previous allocations.
3. It only allocates and probes SIZE bytes, it does not need to
allocate/probe beyond that because this probing style does not