On 06/10/15 11:11, Eric Botcazou wrote:
>> Thanks - I have no further comments on this patch. We probably need to
>> implement the same on AArch64 too in order to avoid similar problems.
>
> Here's the implementation for aarch64, very similar but simpler since
> there is no shortage of scratch registers; the only thing to note is the
> new blockage pattern.  This was tested on real hardware but not with
> Linux, instead with Darwin (an experimental port of the toolchain to iOS),
> and makes it possible to pass ACATS (the Ada conformance testsuite, which
> requires stack checking).
>
> There are also a couple of tweaks for the ARM implementation: a cosmetic
> one for the probe_stack pattern and one for the output_probe_stack_range
> loop.
>
>
> 2015-10-06  Tristan Gingold  <ging...@adacore.com>
>             Eric Botcazou  <ebotca...@adacore.com>
>
>       PR middle-end/65958
>       * config/aarch64/aarch64-protos.h (aarch64_output_probe_stack_range):
>       Declare.
>       * config/aarch64/aarch64.md: Declare UNSPECV_BLOCKAGE and
>       UNSPEC_PROBE_STACK_RANGE.
>       (blockage): New instruction.
>       (probe_stack_range): Likewise.
>       * config/aarch64/aarch64.c (aarch64_emit_probe_stack_range): New
>       function.
>       (aarch64_output_probe_stack_range): Likewise.
>       (aarch64_expand_prologue): Invoke aarch64_emit_probe_stack_range if
>       static builtin stack checking is enabled.
>       * config/aarch64/aarch64-linux.h (STACK_CHECK_STATIC_BUILTIN):
>       Define.
>
>       * config/arm/arm.c (arm_emit_probe_stack_range): Adjust comment.
>       (output_probe_stack_range): Rotate the loop and simplify.
>       (thumb1_expand_prologue): Tweak sorry message.
>       * config/arm/arm.md (probe_stack): Use bare string.
>
>
> 2015-10-06  Eric Botcazou  <ebotca...@adacore.com>
>
>       * gcc.target/aarch64/stack-checking.c: New test.
>
Unless there really is common code between the two patches, this should be
separated out into two posts, one for ARM and one for AArch64.

More comments inline.

> pr65958-2.diff
>
>
> Index: config/aarch64/aarch64-linux.h
> ===================================================================
> --- config/aarch64/aarch64-linux.h    (revision 228512)
> +++ config/aarch64/aarch64-linux.h    (working copy)
> @@ -88,4 +88,7 @@
>  #undef TARGET_BINDS_LOCAL_P
>  #define TARGET_BINDS_LOCAL_P default_binds_local_p_2
>
> +/* Define this to be nonzero if static stack checking is supported.  */
> +#define STACK_CHECK_STATIC_BUILTIN 1
> +
>  #endif  /* GCC_AARCH64_LINUX_H */
> Index: config/aarch64/aarch64-protos.h
> ===================================================================
> --- config/aarch64/aarch64-protos.h   (revision 228512)
> +++ config/aarch64/aarch64-protos.h   (working copy)
> @@ -316,6 +316,7 @@ void aarch64_asm_output_labelref (FILE *
>  void aarch64_cpu_cpp_builtins (cpp_reader *);
>  void aarch64_elf_asm_named_section (const char *, unsigned, tree);
>  const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
> +const char * aarch64_output_probe_stack_range (rtx, rtx);
>  void aarch64_err_no_fpadvsimd (machine_mode, const char *);
>  void aarch64_expand_epilogue (bool);
>  void aarch64_expand_mov_immediate (rtx, rtx);
> Index: config/aarch64/aarch64.c
> ===================================================================
> --- config/aarch64/aarch64.c  (revision 228512)
> +++ config/aarch64/aarch64.c  (working copy)
> @@ -76,6 +76,7 @@
>  #include "sched-int.h"
>  #include "cortex-a57-fma-steering.h"
>  #include "target-globals.h"
> +#include "common/common-target.h"
>
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -2144,6 +2145,167 @@ aarch64_libgcc_cmp_return_mode (void)
>    return SImode;
>  }
>
> +#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
> +
> +#if (PROBE_INTERVAL % 4096) != 0
> +#error Cannot use indexed address calculation for stack probing
> +#endif
> +
> +#if PROBE_INTERVAL > 4096
> +#error Cannot use indexed addressing mode for stack probing
> +#endif
> +

Hmm, so if PROBE_INTERVAL != 4096 we barf!  While that's safe and probably
right for Linux, on some OSes there might be a minimum page size of 16k or
even 64k.  It would be nice if we could support that.
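For example (purely illustrative, not something in the patch), an OS port
with 16k pages would presumably want to bump the probe interval in its
target header, at which point the second #error above trips:

  /* Hypothetical override for an OS with a 16k minimum page size;
     PROBE_INTERVAL then becomes 1 << 14 == 16384.  */
  #define STACK_CHECK_PROBE_INTERVAL_EXP 14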
> +/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
> +   inclusive.  These are offsets from the current stack pointer.  */
> +
> +static void
> +aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
> +{
> +  rtx reg9 = gen_rtx_REG (Pmode, 9);

Ug!  Manifest constants should be moved to pre-defines.
PROBE_STACK_BASE_REG?

> +
> +  /* The following code uses indexed address calculation on FIRST.  */
> +  gcc_assert ((first % 4096) == 0);

where's 4096 come from?

> +
> +  /* See if we have a constant small number of probes to generate.  If so,
> +     that's the easy case.  */
> +  if (size <= PROBE_INTERVAL)
> +    {
> +      emit_set_insn (reg9,
> +                     plus_constant (Pmode, stack_pointer_rtx,
> +                                    -(first + PROBE_INTERVAL)));
> +      emit_stack_probe (plus_constant (Pmode, reg9, PROBE_INTERVAL - size));
> +    }
> +
> +  /* The run-time loop is made up of 8 insns in the generic case while the
> +     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
> +  else if (size <= 4 * PROBE_INTERVAL)
> +    {
> +      HOST_WIDE_INT i, rem;
> +
> +      emit_set_insn (reg9,
> +                     plus_constant (Pmode, stack_pointer_rtx,
> +                                    -(first + PROBE_INTERVAL)));
> +      emit_stack_probe (reg9);
> +
> +      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
> +         it exceeds SIZE.  If only two probes are needed, this will not
> +         generate any code.  Then probe at FIRST + SIZE.  */
> +      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
> +        {
> +          emit_set_insn (reg9, plus_constant (Pmode, reg9, -PROBE_INTERVAL));
> +          emit_stack_probe (reg9);
> +        }
> +
> +      rem = size - (i - PROBE_INTERVAL);
> +      if (rem > 256)
> +        {
> +          emit_set_insn (reg9, plus_constant (Pmode, reg9, -PROBE_INTERVAL));
> +          emit_stack_probe (plus_constant (Pmode, reg9,
> +                                           PROBE_INTERVAL - rem));
> +        }
> +      else
> +        emit_stack_probe (plus_constant (Pmode, reg9, -rem));
> +    }
> +
> +  /* Otherwise, do the same as above, but in a loop.  Note that we must be
> +     extra careful with variables wrapping around because we might be at
> +     the very top (or the very bottom) of the address space and we have
> +     to be able to handle this case properly; in particular, we use an
> +     equality test for the loop condition.  */
> +  else
> +    {
> +      HOST_WIDE_INT rounded_size;
> +      rtx reg10 = gen_rtx_REG (Pmode, 10);

More manifest constants.
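I.e. something like this (macro names invented here to illustrate the
point; only PROBE_STACK_BASE_REG was suggested above):

  /* Scratch registers used by the stack probing sequence.  */
  #define PROBE_STACK_BASE_REG  9
  #define PROBE_STACK_LIMIT_REG 10

  rtx reg9 = gen_rtx_REG (Pmode, PROBE_STACK_BASE_REG);
  rtx reg10 = gen_rtx_REG (Pmode, PROBE_STACK_LIMIT_REG);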
> +
> +      /* Step 1: round SIZE to the previous multiple of the interval.  */
> +
> +      rounded_size = size & -PROBE_INTERVAL;
> +
> +
> +      /* Step 2: compute initial and final value of the loop counter.  */
> +
> +      /* TEST_ADDR = SP + FIRST.  */
> +      emit_set_insn (reg9,
> +                     plus_constant (Pmode, stack_pointer_rtx, -first));
> +
> +      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
> +      emit_set_insn (reg10,
> +                     plus_constant (Pmode, stack_pointer_rtx,
> +                                    -(first + rounded_size)));
> +
> +
> +      /* Step 3: the loop
> +
> +         do
> +           {
> +             TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
> +             probe at TEST_ADDR
> +           }
> +         while (TEST_ADDR != LAST_ADDR)
> +
> +         probes at FIRST + N * PROBE_INTERVAL for values of N from 1
> +         until it is equal to ROUNDED_SIZE.  */
> +
> +      emit_insn (gen_probe_stack_range (reg9, reg9, reg10));
> +
> +
> +      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
> +         that SIZE is equal to ROUNDED_SIZE.  */
> +
> +      if (size != rounded_size)
> +        {
> +          HOST_WIDE_INT rem = size - rounded_size;
> +
> +          if (rem > 256)
> +            {
> +              emit_set_insn (reg10,
> +                             plus_constant (Pmode, reg10, -PROBE_INTERVAL));
> +              emit_stack_probe (plus_constant (Pmode, reg10,
> +                                               PROBE_INTERVAL - rem));
> +            }
> +          else
> +            emit_stack_probe (plus_constant (Pmode, reg10, -rem));
> +        }
> +    }
> +
> +  /* Make sure nothing is scheduled before we are done.  */
> +  emit_insn (gen_blockage ());
> +}
> +
> +/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
> +   absolute addresses.  */
> +
> +const char *
> +aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
> +{
> +  static int labelno = 0;
> +  char loop_lab[32];
> +  rtx xops[2];
> +
> +  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
> +
> +  /* Loop.  */
> +  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
> +
> +  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
> +  xops[0] = reg1;
> +  xops[1] = GEN_INT (PROBE_INTERVAL);
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
> +
> +  /* Probe at TEST_ADDR.  */
> +  output_asm_insn ("str\txzr, [%0]", xops);
> +
> +  /* Test if TEST_ADDR == LAST_ADDR.  */
> +  xops[1] = reg2;
> +  output_asm_insn ("cmp\t%0, %1", xops);
> +
> +  /* Branch.  */
> +  fputs ("\tb.ne\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  return "";
> +}
> +
>  static bool
>  aarch64_frame_pointer_required (void)
>  {
> @@ -2544,6 +2706,18 @@ aarch64_expand_prologue (void)
>    if (flag_stack_usage_info)
>      current_function_static_stack_size = frame_size;
>
> +  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
> +    {
> +      if (crtl->is_leaf && !cfun->calls_alloca)
> +        {
> +          if (frame_size > PROBE_INTERVAL
> +              && frame_size > STACK_CHECK_PROTECT)
> +            aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
> +                                            frame_size - STACK_CHECK_PROTECT);
> +        }
> +      else if (frame_size > 0)
> +        aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
> +    }
> +
>    /* Store pairs and load pairs have a range only -512 to 504.  */
>    if (offset >= 512)
>      {
> Index: config/aarch64/aarch64.md
> ===================================================================
> --- config/aarch64/aarch64.md (revision 228512)
> +++ config/aarch64/aarch64.md (working copy)
> @@ -104,6 +104,7 @@ (define_c_enum "unspec" [
>      UNSPEC_MB
>      UNSPEC_NOP
>      UNSPEC_PRLG_STK
> +    UNSPEC_PROBE_STACK_RANGE
>      UNSPEC_RBIT
>      UNSPEC_SISD_NEG
>      UNSPEC_SISD_SSHL
> @@ -134,6 +135,7 @@ (define_c_enum "unspecv" [
>      UNSPECV_SET_FPCR          ; Represent assign of FPCR content.
>      UNSPECV_GET_FPSR          ; Represent fetch of FPSR content.
>      UNSPECV_SET_FPSR          ; Represent assign of FPSR content.
> +    UNSPECV_BLOCKAGE          ; Represent a blockage
>    ]
> )
>
> @@ -4807,6 +4809,27 @@ (define_insn "stack_tie"
>    [(set_attr "length" "0")]
> )
>
> +;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
> +;; all of memory.  This blocks insns from being moved across this point.
> +
> +(define_insn "blockage"
> +  [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
> +  ""
> +  ""
> +  [(set_attr "length" "0")
> +   (set_attr "type" "block")]
> +)
> +
> +(define_insn "probe_stack_range"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +        (unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> +                             (match_operand:DI 2 "register_operand" "r")]
> +                            UNSPEC_PROBE_STACK_RANGE))]
> +  ""
> +{
> +  return aarch64_output_probe_stack_range (operands[0], operands[2]);
> +})
> +

This should be annotated with the sequence length.
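That is, something along these lines (my sketch; the 16-byte value assumes
the loop emitted above is exactly four 4-byte instructions: sub, str, cmp,
b.ne):

  (define_insn "probe_stack_range"
    [...]
    ""
  {
    return aarch64_output_probe_stack_range (operands[0], operands[2]);
  }
    [(set_attr "length" "16")]
  )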
>  ;; Named pattern for expanding thread pointer reference.
>  (define_expand "get_thread_pointerdi"
>    [(match_operand:DI 0 "register_operand" "=r")]
> Index: config/arm/arm.c
> ===================================================================
> --- config/arm/arm.c  (revision 228512)
> +++ config/arm/arm.c  (working copy)
> @@ -21262,11 +21262,12 @@ arm_emit_probe_stack_range (HOST_WIDE_IN
>
>    /* Step 3: the loop
>
> -     while (TEST_ADDR != LAST_ADDR)
> +     do
>         {
>           TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
>           probe at TEST_ADDR
>         }
> +     while (TEST_ADDR != LAST_ADDR)
>
>       probes at FIRST + N * PROBE_INTERVAL for values of N from 1
>       until it is equal to ROUNDED_SIZE.  */
> @@ -21311,22 +21312,22 @@ output_probe_stack_range (rtx reg1, rtx
>
>    ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
>
> +  /* Loop.  */
>    ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
>
> -  /* Test if TEST_ADDR == LAST_ADDR.  */
> +  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
>    xops[0] = reg1;
> -  xops[1] = reg2;
> -  output_asm_insn ("cmp\t%0, %1", xops);
> +  xops[1] = GEN_INT (PROBE_INTERVAL);
> +  output_asm_insn ("sub\t%0, %0, %1", xops);
>
> -  if (TARGET_THUMB2)
> -    fputs ("\tittt\tne\n", asm_out_file);
> +  /* Probe at TEST_ADDR.  */
> +  output_asm_insn ("str\tr0, [%0, #0]", xops);
>
> -  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
> -  xops[1] = GEN_INT (PROBE_INTERVAL);
> -  output_asm_insn ("subne\t%0, %0, %1", xops);
> +  /* Test if TEST_ADDR == LAST_ADDR.  */
> +  xops[1] = reg2;
> +  output_asm_insn ("cmp\t%0, %1", xops);
>
> -  /* Probe at TEST_ADDR and branch.  */
> -  output_asm_insn ("strne\tr0, [%0, #0]", xops);
> +  /* Branch.  */
>    fputs ("\tbne\t", asm_out_file);
>    assemble_name_raw (asm_out_file, loop_lab);
>    fputc ('\n', asm_out_file);
> @@ -24869,7 +24870,7 @@ thumb1_expand_prologue (void)
>
>    /* If we have a frame, then do stack checking.  FIXME: not implemented.  */
>    if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK && size)
> -    sorry ("-fstack-check=specific for THUMB1");
> +    sorry ("-fstack-check=specific for Thumb-1");
>
>    amount = offsets->outgoing_args - offsets->saved_regs;
>    amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
> Index: config/arm/arm.md
> ===================================================================
> --- config/arm/arm.md (revision 228512)
> +++ config/arm/arm.md (working copy)
> @@ -8262,9 +8262,7 @@ (define_insn "probe_stack"
>    [(set (match_operand 0 "memory_operand" "=m")
>          (unspec [(const_int 0)] UNSPEC_PROBE_STACK))]
>    "TARGET_32BIT"
> -{
> -  return "str%?\\tr0, %0";
> -}
> +  "str%?\\tr0, %0"
>    [(set_attr "type" "store1")
>     (set_attr "predicable" "yes")]
> )
>
>
> stack-checking.c
>
>
> /* { dg-do run { target { *-*-linux* } } } */
> /* { dg-options "-fstack-check" } */
>
> int main(void)
> {
>   char *p;
>   if (1)
>     {
>       char i[48];
>       p = __builtin_alloca(8);
>       p[0] = 1;
>     }
>
>   if (1)
>     {
>       char i[48], j[64];
>       j[48] = 0;
>     }
>
>   return !p[0];
> }

R.