Hi Richard,

Here's the updated patch with all of the feedback addressed.
I have also run the compile tests through with -mabi=ilp32 as well. Ok for trunk? Thanks, Tamar The 09/27/2018 12:11, Richard Sandiford wrote: > > It turns out the testsuite didn't have a case in it which would cause a > > significant enough spill to enter the loop. After creating one I noticed a > > bug > > in the loop and fixed it. > > > > The loops are now > > > > .cfi_startproc > > mov x15, sp > > cntb x16, all, mul #11 > > add x16, x16, 304 > > .cfi_def_cfa_register 15 > > .SVLPSPL0: > > cmp x16, 61440 > > b.lt .SVLPEND0 > > sub sp, sp, 61440 > > str xzr, [sp, 0] > > subs x16, x16, 61440 > > (The code uses sub rather than subs here) > > > b .SVLPSPL0 > > .SVLPEND0: > > sub sp, sp, x16 > > .cfi_escape > > 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22 > > > > for a 64KB guard size. > > That's OK with me. Like you say, the main goal was to make the common > case of no probe as fast as possible. > > > diff --git a/gcc/config/aarch64/aarch64-protos.h > > b/gcc/config/aarch64/aarch64-protos.h > > index > > ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 > > 100644 > > --- a/gcc/config/aarch64/aarch64-protos.h > > +++ b/gcc/config/aarch64/aarch64-protos.h > > @@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *); > > void aarch64_cpu_cpp_builtins (cpp_reader *); > > const char * aarch64_gen_far_branch (rtx *, int, const char *, const char > > *); > > const char * aarch64_output_probe_stack_range (rtx, rtx); > > +const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx); > > void aarch64_err_no_fpadvsimd (machine_mode); > > void aarch64_expand_epilogue (bool); > > void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0); > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > > index > > d4b13d48d852a70848fc7c51fd867e776efb5e55..245fd6832ec0afe27c42a242c901a2e13024f935 > > 100644 > > --- a/gcc/config/aarch64/aarch64.c > > +++ b/gcc/config/aarch64/aarch64.c > > @@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment > > (machine_mode mode, > > static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); > > static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, > > aarch64_addr_query_type); > > +static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val); > > > /* Major revision number of the ARM Architecture implemented by the > > target. */ > > unsigned aarch64_architecture_version; > > @@ -3973,6 +3974,83 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2) > > return ""; > > } > > > +/* Emit the probe loop for doing stack clash probes and stack adjustments > > for > > + SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard > > size > > + of GUARD_SIZE. When a probe is emitted it is done at MIN_PROBE_OFFSET > > bytes > > + from the current BASE at an interval of MIN_PROBE_OFFSET. By the end > > of this > > MIN_PROBE_THRESHOLD in both cases (or rename the var to min_probe_offset, > either's fine). Probably "at most MIN_PROBE..." given the round down. > > > + function BASE = BASE - ADJUSTMENT. */ > > + > > +const char * > > +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, > > + rtx min_probe_threshold, rtx guard_size) > > +{ > > + /* This function is not allowed to use any instruction generation > > function > > + like gen_ and friends. If you do you'll likely ICE during CFG > > validation, > > + so instead emit the code you want using output_asm_insn. 
*/ > > + gcc_assert (flag_stack_clash_protection); > > + gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P > > (guard_size)); > > + gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold)); > > + > > + /* The minimum required allocation before the residual requires probing. > > */ > > + HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold); > > + > > + /* Clamp the value down to the nearest value that can be used with a > > cmp. */ > > + residual_probe_guard = aarch64_uimm12_nearest_value > > (residual_probe_guard); > > Maybe aarch64_clamp_to_uimm12_shift or aarch64_round_down_to_uimm12_shift > would be better; nearest implies that "0x1ff0" should become "0x2000" > rather than "0x1000". > > > + /* ADJUSTMENT == RESIDUAL_PROBE_GUARD. */ > > + xops[0] = adjustment; > > + xops[1] = probe_offset_value_rtx; > > + output_asm_insn ("cmp\t%0, %1", xops); > > < rather than == (or just "Compare ...") > > > + /* Branch to end if not enough adjustment to probe. */ > > + fputs ("\tb.lt\t", asm_out_file); > > + assemble_name_raw (asm_out_file, loop_end_lab); > > + fputc ('\n', asm_out_file); > > + > > + /* BASE = BASE - RESIDUAL_PROBE_GUARD. */ > > + xops[0] = base; > > + xops[1] = gen_int_mode (residual_probe_guard, Pmode); > > probe_offset_value_rtx > > > + HOST_WIDE_INT size; > > + /* Handle the SVE non-constant case first. */ > > + if (!poly_size.is_constant (&size)) > > + { > > + > > Excess blank line. > > > + if (dump_file) > > + { > > + fprintf (dump_file, "Stack clash SVE prologue: "); > > + print_dec (poly_size, dump_file); > > + fprintf (dump_file, " bytes, dynamic probing will be required.\n"); > > + } > > + > > + /* First calculate the amount of bytes we're actually spilling. */ > > + aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)), > > Might as well use Pmode for the CONST0_RTX too, for consistency with the > first argument to aarch64_add_offset. > > > + poly_size, temp1, temp2, false, true); > > + > > + rtx_insn *insn = get_last_insn (); > > + > > + if (frame_related_p) > > + { > > + /* This is done to provide unwinding information for the stack > > + adjustments we're about to do, however to prevent the optimizers > > + from removing the R15 move and leaving the CFA note (which would be > > + very wrong) we tie the old and new stack pointer together. > > + The tie will expand to nothing but the optimizers will not touch > > + the instruction. */ > > + rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM); > > + emit_move_insn (stack_ptr_copy, stack_pointer_rtx); > > + emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx)); > > + > > + /* We want the CFA independent of the stack pointer for the > > + duration of the loop. */ > > + add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy); > > + RTX_FRAME_RELATED_P (insn) = 1; > > + } > > + > > + rtx probe_const = gen_int_mode (min_probe_threshold, DImode); > > + rtx guard_const = gen_int_mode (guard_size, DImode); > > Pmode in both cases. (No practical difference, but it makes everything > agree on the mode.) > > > if (dump_file) > > - fprintf (dump_file, > > - "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes" > > - ", probing will be required.\n", size); > > + { > > + fprintf (dump_file, > > + "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC > > + " bytes, probing will be required.\n", size); > > + } > > Not needed (previous formatting without { ... } was right). 
> > > +/* Returns the nearest value to VAL that will fit as a 12-bit unsigned > > immediate > > + that can be created with a left shift of 0 or 12. */ > > +static HOST_WIDE_INT > > +aarch64_uimm12_nearest_value (HOST_WIDE_INT val) > > +{ > > + if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val) > > + return val; > > + > > + return val & (((HOST_WIDE_INT) 0xfff) << 12); > > +} > > Are these HOST_WIDE_INT casts needed? > > Probably worth asserting that (val & 0xffffff) == val, or handle > the case in which it isn't by returning 0xfff000. > > > +;; This instruction is used to generate the stack clash stack adjustment > > and > > +;; probing loop. We can't change the control flow during prologue and > > epilogue > > +;; code generation. So we must emit a volatile unspec and expand it later > > on. > > + > > +(define_insn "probe_sve_stack_clash" > > + [(set (match_operand:DI 0 "register_operand" "=rk") > > + (unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0") > > + (match_operand:DI 2 "register_operand" "r") > > + (match_operand:DI 3 "const_int_operand" "n") > > + (match_operand:DI 4 "aarch64_plus_immediate" "L")] > > + UNSPECV_PROBE_STACK_RANGE))] > > + "TARGET_SVE" > > +{ > > + return aarch64_output_probe_sve_stack_clash (operands[0], operands[2], > > + operands[3], operands[4]); > > +} > > + [(set_attr "length" "28")] > > +) > > Think this will break for ILP32. We probably need :P instead of :DI and > > "@probe_sve_stack_clash_<mode>" > > gen_probe_sve_stack_clash (Pmode, ...) > > > diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c > > b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c > > new file mode 100644 > > index > > 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c > > @@ -0,0 +1,30 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param > > stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */ > > +/* { dg-require-effective-target supports_stack_clash_protection } */ > > + > > +#include <stdint.h> > > + > > +#define N 20040 > > + > > +void __attribute__ ((noinline, noclone)) > > +test (int8_t *restrict dest, int8_t *restrict src) > > +{ > > + for (int i = 0; i < N; i+=8) > > + { > > + dest[i] += src[i * 4]; > > + dest[i+1] += src[i * 4 + 1]; > > + dest[i+2] += src[i * 4 + 2]; > > + dest[i+3] += src[i * 4 + 3]; > > + dest[i+4] += src[i * 4 + 4]; > > + dest[i+5] += src[i * 4 + 5]; > > + dest[i+6] += src[i * 4 + 6]; > > + dest[i+7] += src[i * 4 + 7]; > > + } > > +} > > I think we should use something that has a higher guarantee of > spilling, since we shouldn't really need to spill for the above. > See g++.target/aarch64/sve/catch_1.C for one possibility. > > > +/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */ > > +/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */ > > +/* { dg-final { scan-assembler-times {\.cfi_escape > > 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */ > > + > > +/* Checks that the CFA notes are correct for every sp adjustment, but we > > also > > + need to make sure we can unwind correctly before the frame is set up. > > So > > + check that we're emitting r15 with a copy of sp an setting the CFA > > there. */ > > Think this comment belongs above the dg-finals -- seems odd to have it at > the end of the file. 
> > I'll take your word that the cfi_escape is correct, but it looks like > it matches the full calculation, including the VG multiple. It would > be better to leave out that part of the encoding, since the number of > SVE vectors spilled could vary quite easily. > > > diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c > > b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c > > new file mode 100644 > > index > > 0000000000000000000000000000000000000000..fd0e987597eba406fa7351433fe7157743aeca42 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c > > @@ -0,0 +1,32 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target supports_stack_clash_protection } */ > > +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param > > stack-clash-protection-guard-size=16 -ftree-vectorize" } */ > > + > > + > > +#include <stdint.h> > > Excess blank line before include. > > > +#define N 20040 > > + > > +void __attribute__ ((noinline, noclone)) > > +test (int8_t *restrict dest, int8_t *restrict src) > > +{ > > + for (int i = 0; i < N; i+=8) > > + { > > + dest[i] += src[i * 4]; > > + dest[i+1] += src[i * 4 + 1]; > > + dest[i+2] += src[i * 4 + 2]; > > + dest[i+3] += src[i * 4 + 3]; > > + dest[i+4] += src[i * 4 + 4]; > > + dest[i+5] += src[i * 4 + 5]; > > + dest[i+6] += src[i * 4 + 6]; > > + dest[i+7] += src[i * 4 + 7]; > > + } > > +} > > + > > + > > +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */ > > +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */ > > +/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } > > */ > > + > > +/* SVE spill, requires probing as vector size is unknown at compile time. > > */ > > Same comments above forcing spilling and putting the comment before > the dg-finals. > > Thanks, > Richard --
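Before the patch itself, here is a rough C model of what the emitted probe loop computes, for anyone following the assembly above without stepping through it. The function below is purely illustrative (the names and the driver are mine, not part of the patch); it just mirrors the cmp/b.lt/sub/str/b sequence shown earlier:

#include <stdint.h>

/* BASE plays the role of the stack pointer, ADJUSTMENT the runtime SVE
   frame size and GUARD the clamped probing interval (61440 for a 64KB
   guard size).  */
static char *
probe_loop_model (char *base, uint64_t adjustment, uint64_t guard)
{
  while (adjustment >= guard)   /* cmp x16, 61440; b.lt .SVLPEND0  */
    {
      base -= guard;            /* sub sp, sp, 61440  */
      base[0] = 0;              /* str xzr, [sp, 0] -- the probe  */
      adjustment -= guard;      /* sub x16, x16, 61440  */
    }                           /* b .SVLPSPL0  */
  base -= adjustment;           /* sub sp, sp, x16  */
  return base;
}

int
main (void)
{
  static char stack[4 * 61440];
  char *sp = probe_loop_model (stack + sizeof stack, 3 * 61440 + 304, 61440);
  return sp != stack + sizeof stack - (3 * 61440 + 304);
}

When the adjustment is smaller than the clamped guard the loop body never runs and only the final sub executes, which is the common fast path mentioned above.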
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *); void aarch64_cpu_cpp_builtins (cpp_reader *); const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *); const char * aarch64_output_probe_stack_range (rtx, rtx); +const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx); void aarch64_err_no_fpadvsimd (machine_mode); void aarch64_expand_epilogue (bool); void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0); diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index d4b13d48d852a70848fc7c51fd867e776efb5e55..8c901e9d8c00d392a2df62d9b63ce5b865b48e50 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, aarch64_addr_query_type); +static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val); /* Major revision number of the ARM Architecture implemented by the target. */ unsigned aarch64_architecture_version; @@ -3973,6 +3974,84 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2) return ""; } +/* Emit the probe loop for doing stack clash probes and stack adjustments for + SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size + of GUARD_SIZE. When a probe is emitted it is done at most + MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of + at most MIN_PROBE_THRESHOLD. By the end of this function + BASE = BASE - ADJUSTMENT. */ + +const char * +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, + rtx min_probe_threshold, rtx guard_size) +{ + /* This function is not allowed to use any instruction generation function + like gen_ and friends. If you do you'll likely ICE during CFG validation, + so instead emit the code you want using output_asm_insn. */ + gcc_assert (flag_stack_clash_protection); + gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size)); + gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold)); + + /* The minimum required allocation before the residual requires probing. */ + HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold); + + /* Clamp the value down to the nearest value that can be used with a cmp. */ + residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard); + rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode); + + gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard); + gcc_assert (aarch64_uimm12_shift (residual_probe_guard)); + + static int labelno = 0; + char loop_start_lab[32]; + char loop_end_lab[32]; + rtx xops[2]; + + ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno); + ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++); + + /* Emit loop start label. */ + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab); + + /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */ + xops[0] = adjustment; + xops[1] = probe_offset_value_rtx; + output_asm_insn ("cmp\t%0, %1", xops); + + /* Branch to end if not enough adjustment to probe. 
*/ + fputs ("\tb.lt\t", asm_out_file); + assemble_name_raw (asm_out_file, loop_end_lab); + fputc ('\n', asm_out_file); + + /* BASE = BASE - RESIDUAL_PROBE_GUARD. */ + xops[0] = base; + xops[1] = probe_offset_value_rtx; + output_asm_insn ("sub\t%0, %0, %1", xops); + + /* Probe at BASE. */ + xops[1] = const0_rtx; + output_asm_insn ("str\txzr, [%0, %1]", xops); + + /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */ + xops[0] = adjustment; + xops[1] = probe_offset_value_rtx; + output_asm_insn ("sub\t%0, %0, %1", xops); + + /* Branch to start if still more bytes to allocate. */ + fputs ("\tb\t", asm_out_file); + assemble_name_raw (asm_out_file, loop_start_lab); + fputc ('\n', asm_out_file); + + /* No probe leave. */ + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab); + + /* BASE = BASE - ADJUSTMENT. */ + xops[0] = base; + xops[1] = adjustment; + output_asm_insn ("sub\t%0, %0, %1", xops); + return ""; +} + /* Determine whether a frame chain needs to be generated. */ static bool aarch64_needs_frame_chain (void) @@ -4835,21 +4914,73 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, } } - HOST_WIDE_INT size; /* If SIZE is not large enough to require probing, just adjust the stack and exit. */ - if (!poly_size.is_constant (&size) - || known_lt (poly_size, min_probe_threshold) + if (known_lt (poly_size, min_probe_threshold) || !flag_stack_clash_protection) { aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p); return; } + HOST_WIDE_INT size; + /* Handle the SVE non-constant case first. */ + if (!poly_size.is_constant (&size)) + { + if (dump_file) + { + fprintf (dump_file, "Stack clash SVE prologue: "); + print_dec (poly_size, dump_file); + fprintf (dump_file, " bytes, dynamic probing will be required.\n"); + } + + /* First calculate the amount of bytes we're actually spilling. */ + aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode), + poly_size, temp1, temp2, false, true); + + rtx_insn *insn = get_last_insn (); + + if (frame_related_p) + { + /* This is done to provide unwinding information for the stack + adjustments we're about to do, however to prevent the optimizers + from removing the R15 move and leaving the CFA note (which would be + very wrong) we tie the old and new stack pointer together. + The tie will expand to nothing but the optimizers will not touch + the instruction. */ + rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM); + emit_move_insn (stack_ptr_copy, stack_pointer_rtx); + emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx)); + + /* We want the CFA independent of the stack pointer for the + duration of the loop. */ + add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy); + RTX_FRAME_RELATED_P (insn) = 1; + } + + rtx probe_const = gen_int_mode (min_probe_threshold, Pmode); + rtx guard_const = gen_int_mode (guard_size, Pmode); + + insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx, + stack_pointer_rtx, temp1, + probe_const, guard_const)); + + /* Now reset the CFA register if needed. 
*/ + if (frame_related_p) + { + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + gen_int_mode (poly_size, Pmode))); + RTX_FRAME_RELATED_P (insn) = 1; + } + + return; + } + if (dump_file) fprintf (dump_file, - "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes" - ", probing will be required.\n", size); + "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC + " bytes, probing will be required.\n", size); /* Round size to the nearest multiple of guard_size, and calculate the residual as the difference between the original size and the rounded @@ -5458,6 +5589,20 @@ aarch64_uimm12_shift (HOST_WIDE_INT val) ); } +/* Return VAL rounded down to the nearest value that will fit as a 12-bit + unsigned immediate that can be created with a left shift of 0 or 12. */ +static HOST_WIDE_INT +aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val) +{ + /* Check to see if the value fits in 24 bits, as that is the maximum we can + handle correctly. */ + gcc_assert ((val & 0xffffff) == val); + + if (((val & 0xfff) << 0) == val) + return val; + + return val & (0xfff << 12); +} /* Return true if val is an immediate that can be loaded into a register by a MOVZ instruction. */ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..22eb026f0631958536ab0c33c4d234d0156dc120 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -6464,6 +6464,25 @@ [(set_attr "length" "32")] ) +;; This instruction is used to generate the stack clash stack adjustment and +;; probing loop. We can't change the control flow during prologue and epilogue +;; code generation. So we must emit a volatile unspec and expand it later on. + +(define_insn "@probe_sve_stack_clash_<mode>" + [(set (match_operand:P 0 "register_operand" "=rk") + (unspec_volatile:P [(match_operand:P 1 "register_operand" "0") + (match_operand:P 2 "register_operand" "r") + (match_operand:P 3 "const_int_operand" "n") + (match_operand:P 4 "aarch64_plus_immediate" "L")] + UNSPECV_PROBE_STACK_RANGE))] + "TARGET_SVE" +{ + return aarch64_output_probe_sve_stack_clash (operands[0], operands[2], + operands[3], operands[4]); +} + [(set_attr "length" "28")] +) + ;; Named pattern for expanding thread pointer reference. (define_expand "get_thread_pointerdi" [(match_operand:DI 0 "register_operand" "=r")] diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c new file mode 100644 index 0000000000000000000000000000000000000000..41579f26ba9156f3e500f090d132ba9cf28364d3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables" } */ +/* { dg-require-effective-target supports_stack_clash_protection } */ + +#include "stack-check-prologue-16.c" + +/* Checks that the CFA notes are correct for every sp adjustment, but we also + need to make sure we can unwind correctly before the frame is set up. So + check that we're emitting r15 with a copy of sp and setting the CFA there. 
*/ + +/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */ +/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */ +/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,.*} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c new file mode 100644 index 0000000000000000000000000000000000000000..d92ef47a57ddda556c563e36ad8aaf4acdeabc57 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target supports_stack_clash_protection } */ +/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */ + +/* Invoke X (P##n) for n in [0, 7]. */ +#define REPEAT8(X, P) \ + X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) + +/* Invoke X (n) for all octal n in [0, 39]. */ +#define REPEAT40(X) \ + REPEAT8 (X, 0) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4) + +/* Expect vector work to be done, with spilling of vector registers. */ +void +f2 (int x[40][100], int *y) +{ + /* Try to force some spilling. */ +#define DECLARE(N) int y##N = y[N]; + REPEAT40 (DECLARE); +#pragma omp simd + for (int i = 0; i < 100; ++i) + { +#define INC(N) x[N][i] += y##N; + REPEAT40 (INC); + } +} + +/* SVE spill, requires probing as vector size is unknown at compile time. */ + +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */ +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */ +/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c new file mode 100644 index 0000000000000000000000000000000000000000..68a9d5e3d2e74cb331dff0ef3bcd612f8bb0d0f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target supports_stack_clash_protection } */ +/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */ + +#include <stdint.h> + +#define N 50 +#define S 2 * 64 * 1024 + +/* Invoke X (P##n) for n in [0, 9]. */ +#define REPEAT8(X, P) \ + X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \ + X (P##8) X (P##9) + +/* Invoke X (n) for all n in [0, 49]. */ +#define REPEAT50(X) \ + REPEAT8 (X, ) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4) + + /* Try to force some spilling. */ +#define DECLARE(N) int src##N = src[N * 4]; +#define INC(N) dest[i] += src##N; + +#define TEST_LOOP(NAME, TYPE) \ + void __attribute__ ((noinline, noclone, simd)) \ + NAME (TYPE *restrict dest, TYPE *restrict src) \ + { \ + REPEAT50 (DECLARE); \ + volatile char foo[S]; \ + foo[S-1]=1; \ + for (int i = 0; i < N; i++) \ + { \ + REPEAT50 (INC); \ + } \ + } + +#define TEST(NAME) \ + TEST_LOOP (NAME##_i32, int32_t) \ + TEST_LOOP (NAME##_i64, int64_t) \ + TEST_LOOP (NAME##_f32, float) \ + TEST_LOOP (NAME##_f64, double) + +TEST (test) + +/* Check the vectorized loop for stack clash probing. 
*/ + +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 4 } } */ +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 4 } } */ +/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 4 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c new file mode 100644 index 0000000000000000000000000000000000000000..e764476faccded380102dfbc759be7cf6be88345 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_24_run.c @@ -0,0 +1,37 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-require-effective-target supports_stack_clash_protection } */ +/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */ + +#include "struct_vect_24.c" + +#undef TEST_LOOP +#define TEST_LOOP(NAME, TYPE) \ + { \ + TYPE out[N]; \ + TYPE in[N * 4]; \ + for (int i = 0; i < N; ++i) \ + { \ + out[i] = i * 7 / 2; \ + asm volatile ("" ::: "memory"); \ + } \ + for (int i = 0; i < N * 4; ++i) \ + { \ + in[i] = i * 9 / 2; \ + asm volatile ("" ::: "memory"); \ + } \ + NAME (out, in); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i * 7 / 2; \ + if (out[i] != out[0] + expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (0))) +main (void) +{ + TEST (test); + return 0; +}
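As a footnote on the rounding question raised above, here is a standalone sketch of the behaviour aarch64_clamp_to_uimm12_shift is meant to have. Only the rounding logic mirrors the patch; the driver and the 61440 remark (which assumes the usual 1KB caller-guard allowance on a 64KB guard) are mine and purely illustrative:

#include <stdio.h>
#include <assert.h>

/* Round VAL down to the nearest value usable as a 12-bit unsigned immediate,
   optionally left-shifted by 12 (the immediate forms cmp and sub accept).  */
static long
clamp_to_uimm12_shift (long val)
{
  assert ((val & 0xffffff) == val);  /* Only 24-bit values are handled.  */
  if ((val & 0xfff) == val)          /* Already a plain uimm12.  */
    return val;
  return val & (0xfff << 12);        /* Keep only the shifted-by-12 part.  */
}

int
main (void)
{
  /* Rounds down: 0x1ff0 becomes 0x1000, not 0x2000 -- hence dropping
     "nearest" from the name.  */
  printf ("0x1ff0 -> 0x%lx\n", clamp_to_uimm12_shift (0x1ff0));
  /* A 64KB guard minus the 1KB caller allowance is 0xfc00, which clamps to
     0xf000 (61440) -- the value the scan-assembler tests above look for.  */
  printf ("0xfc00 -> 0x%lx\n", clamp_to_uimm12_shift (0xfc00));
  return 0;
}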