> It turns out the testsuite didn't have a case in it which would cause a
> significant enough spill to enter the loop.  After creating one I noticed
> a bug in the loop and fixed it.
>
> The loops are now
>
>         .cfi_startproc
>         mov     x15, sp
>         cntb    x16, all, mul #11
>         add     x16, x16, 304
>         .cfi_def_cfa_register 15
> .SVLPSPL0:
>         cmp     x16, 61440
>         b.lt    .SVLPEND0
>         sub     sp, sp, 61440
>         str     xzr, [sp, 0]
>         subs    x16, x16, 61440

(The code uses sub rather than subs here)

>         b      .SVLPSPL0
> .SVLPEND0:
>         sub     sp, sp, x16
>         .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
>
> for a 64KB guard size.

That's OK with me.  Like you say, the main goal was to make the common
case of no probe as fast as possible.
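
For the record, this is how I read that loop in C terms (purely my own model
of the assembly above, not anything in the patch; "vl_bytes" stands for
whatever cntb gives at run time and 61440 is the probe interval used with the
64KB guard):

#include <stdint.h>

/* C-level model of the emitted probing loop, for reference only.  */
static uintptr_t
model_sve_probe_loop (uintptr_t sp, uint64_t vl_bytes)
{
  uint64_t required = 11 * vl_bytes + 304;  /* cntb x16, all, mul #11; add x16, x16, 304 */
  const uint64_t probe_interval = 61440;

  while (required >= probe_interval)        /* cmp x16, 61440; b.lt .SVLPEND0 */
    {
      sp -= probe_interval;                 /* sub sp, sp, 61440 */
      *(volatile uint64_t *) sp = 0;        /* str xzr, [sp, 0] -- the probe */
      required -= probe_interval;           /* sub x16, x16, 61440 */
    }
  return sp - required;                     /* sub sp, sp, x16 */
}

So in the common case the loop body is skipped entirely and we fall straight
through to the final sub.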

> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
>  void aarch64_cpu_cpp_builtins (cpp_reader *);
>  const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
>  const char * aarch64_output_probe_stack_range (rtx, rtx);
> +const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
>  void aarch64_err_no_fpadvsimd (machine_mode);
>  void aarch64_expand_epilogue (bool);
>  void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index d4b13d48d852a70848fc7c51fd867e776efb5e55..245fd6832ec0afe27c42a242c901a2e13024f935 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -208,6 +208,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
>  static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
>  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
>                                           aarch64_addr_query_type);
> +static HOST_WIDE_INT aarch64_uimm12_nearest_value (HOST_WIDE_INT val);
 
>  /* Major revision number of the ARM Architecture implemented by the target.  */
>  unsigned aarch64_architecture_version;
> @@ -3973,6 +3974,83 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
>    return "";
>  }
 
> +/* Emit the probe loop for doing stack clash probes and stack adjustments for
> +   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
> +   of GUARD_SIZE.  When a probe is emitted it is done at MIN_PROBE_OFFSET bytes
> +   from the current BASE at an interval of MIN_PROBE_OFFSET.  By the end of this

The comment should say MIN_PROBE_THRESHOLD in both cases (or rename the
parameter to min_probe_offset, either's fine).  Probably "at most MIN_PROBE..."
given the round down.

> +   function BASE = BASE - ADJUSTMENT.  */
> +
> +const char *
> +aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
> +                                   rtx min_probe_threshold, rtx guard_size)
> +{
> +  /* This function is not allowed to use any instruction generation function
> +     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
> +     so instead emit the code you want using output_asm_insn.  */
> +  gcc_assert (flag_stack_clash_protection);
> +  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
> +  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
> +
> +  /* The minimum required allocation before the residual requires probing.  */
> +  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
> +
> +  /* Clamp the value down to the nearest value that can be used with a cmp.  */
> +  residual_probe_guard = aarch64_uimm12_nearest_value (residual_probe_guard);

Maybe aarch64_clamp_to_uimm12_shift or aarch64_round_down_to_uimm12_shift
would be better; nearest implies that "0x1ff0" should become "0x2000"
rather than "0x1000".

> +  /* ADJUSTMENT == RESIDUAL_PROBE_GUARD.  */
> +  xops[0] = adjustment;
> +  xops[1] = probe_offset_value_rtx;
> +  output_asm_insn ("cmp\t%0, %1", xops);

Should be < rather than == (or just "Compare ...").

> +  /* Branch to end if not enough adjustment to probe.  */
> +  fputs ("\tb.lt\t", asm_out_file);
> +  assemble_name_raw (asm_out_file, loop_end_lab);
> +  fputc ('\n', asm_out_file);
> +
> +  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
> +  xops[0] = base;
> +  xops[1] = gen_int_mode (residual_probe_guard, Pmode);

This could just reuse probe_offset_value_rtx.

> +  HOST_WIDE_INT size;
> +  /* Handle the SVE non-constant case first.  */
> +  if (!poly_size.is_constant (&size))
> +    {
> +

Excess blank line.

> +     if (dump_file)
> +      {
> +       fprintf (dump_file, "Stack clash SVE prologue: ");
> +       print_dec (poly_size, dump_file);
> +       fprintf (dump_file, " bytes, dynamic probing will be required.\n");
> +      }
> +
> +      /* First calculate the amount of bytes we're actually spilling.  */
> +      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),

Might as well use Pmode for the CONST0_RTX too, for consistency with the
first argument to aarch64_add_offset.

> +                       poly_size, temp1, temp2, false, true);
> +
> +      rtx_insn *insn = get_last_insn ();
> +
> +      if (frame_related_p)
> +     {
> +       /* This is done to provide unwinding information for the stack
> +          adjustments we're about to do, however to prevent the optimizers
> +          from removing the R15 move and leaving the CFA note (which would be
> +          very wrong) we tie the old and new stack pointer together.
> +          The tie will expand to nothing but the optimizers will not touch
> +          the instruction.  */
> +       rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
> +       emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
> +       emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
> +
> +       /* We want the CFA independent of the stack pointer for the
> +          duration of the loop.  */
> +       add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
> +       RTX_FRAME_RELATED_P (insn) = 1;
> +     }
> +
> +      rtx probe_const = gen_int_mode (min_probe_threshold, DImode);
> +      rtx guard_const = gen_int_mode (guard_size, DImode);

Pmode in both cases.  (No practical difference, but it makes everything
agree on the mode.)

>    if (dump_file)
> -    fprintf (dump_file,
> -          "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
> -          ", probing will be required.\n", size);
> +    {
> +      fprintf (dump_file,
> +            "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
> +            " bytes, probing will be required.\n", size);
> +    }

Not needed (previous formatting without { ... } was right).

> +/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
> +   that can be created with a left shift of 0 or 12.  */
> +static HOST_WIDE_INT
> +aarch64_uimm12_nearest_value (HOST_WIDE_INT val)
> +{
> +  if ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val)
> +    return val;
> +
> +  return val & (((HOST_WIDE_INT) 0xfff) << 12);
> +}

Are these HOST_WIDE_INT casts needed?

Probably worth asserting that (val & 0xffffff) == val, or handle
the case in which it isn't by returning 0xfff000.
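
So perhaps something like this (an untested sketch rather than part of the
patch, using the round-down name suggested above and returning 0xfff000 for
anything out of range; it assumes VAL is nonnegative):

/* Round VAL down to the nearest value usable as a 12-bit unsigned
   immediate, optionally shifted left by 12.  */
static HOST_WIDE_INT
aarch64_round_down_to_uimm12_shift (HOST_WIDE_INT val)
{
  /* Representable directly as a uimm12.  */
  if ((val & 0xfff) == val)
    return val;

  /* Anything needing more than 24 bits can at best use the largest
     shifted immediate.  */
  if (val >= 0x1000000)
    return 0xfff000;

  /* Otherwise keep only the bits a "uimm12, lsl #12" immediate can carry.  */
  return val & 0xfff000;
}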
 
> +;; This instruction is used to generate the stack clash stack adjustment and
> +;; probing loop.  We can't change the control flow during prologue and epilogue
> +;; code generation.  So we must emit a volatile unspec and expand it later on.
> +
> +(define_insn "probe_sve_stack_clash"
> +  [(set (match_operand:DI 0 "register_operand" "=rk")
> +     (unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
> +                          (match_operand:DI 2 "register_operand" "r")
> +                          (match_operand:DI 3 "const_int_operand" "n")
> +                          (match_operand:DI 4 "aarch64_plus_immediate" "L")]
> +                           UNSPECV_PROBE_STACK_RANGE))]
> +  "TARGET_SVE"
> +{
> +  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
> +                                            operands[3], operands[4]);
> +}
> +  [(set_attr "length" "28")]
> +)

Think this will break for ILP32.  We probably need :P instead of :DI and

  "@probe_sve_stack_clash_<mode>"

  gen_probe_sve_stack_clash (Pmode, ...)
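
i.e. on the caller side something like this (untested sketch; the operand
order follows the pattern as posted and the wrapper name is only for
illustration):

/* With "@probe_sve_stack_clash_<mode>" and a :P iterator, the generated
   gen_* function takes the mode as its first argument, so the caller can
   simply pass Pmode and the same code works for ILP32 as well as LP64.  */
static void
emit_sve_probe_loop (rtx adjustment, HOST_WIDE_INT min_probe_threshold,
                     HOST_WIDE_INT guard_size)
{
  rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
  rtx guard_const = gen_int_mode (guard_size, Pmode);
  emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
                                        stack_pointer_rtx, adjustment,
                                        probe_const, guard_const));
}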

> diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
> +/* { dg-require-effective-target supports_stack_clash_protection } */
> +
> +#include <stdint.h>
> +
> +#define N 20040
> +
> +void __attribute__ ((noinline, noclone))
> +test (int8_t *restrict dest, int8_t *restrict src)
> +{
> +  for (int i = 0; i < N; i+=8)
> +    {
> +      dest[i] += src[i * 4];
> +      dest[i+1] += src[i * 4 + 1];
> +      dest[i+2] += src[i * 4 + 2];
> +      dest[i+3] += src[i * 4 + 3];
> +      dest[i+4] += src[i * 4 + 4];
> +      dest[i+5] += src[i * 4 + 5];
> +      dest[i+6] += src[i * 4 + 6];
> +      dest[i+7] += src[i * 4 + 7];
> +    }
> +}

I think we should use something that has a higher guarantee of
spilling, since we shouldn't really need to spill for the above.
See g++.target/aarch64/sve/catch_1.C for one possibility.
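
One way of getting a guaranteed SVE spill from C (an untested sketch of the
idea rather than a ready-made test; "external_call" is hypothetical and it may
need more live values than this to make the spill large enough) is to keep SVE
values live across a call the compiler can't see into, so the full vectors
have to be saved on the stack:

#include <arm_sve.h>

void external_call (void);   /* hypothetical, defined elsewhere */

svint8_t __attribute__ ((noinline, noclone))
test (svint8_t a, svint8_t b, svint8_t c, svint8_t d)
{
  /* a..d arrive in call-clobbered Z registers and are still needed after
     the call, so they must be spilled as full SVE vectors, giving the
     frame a size that depends on the vector length.  */
  external_call ();
  svbool_t pg = svptrue_b8 ();
  return svadd_s8_x (pg, svadd_s8_x (pg, a, b), svadd_s8_x (pg, c, d));
}

(That assumes the ACLE types in arm_sve.h are usable here; if not, adapting
catch_1.C is probably the safer route.)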

> +/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
> +/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
> +/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
> +
> +/* Checks that the CFA notes are correct for every sp adjustment, but we also
> +   need to make sure we can unwind correctly before the frame is set up.  So
> +   check that we're emitting r15 with a copy of sp and setting the CFA there.  */

Think this comment belongs above the dg-finals -- seems odd to have it at
the end of the file.

I'll take your word that the cfi_escape is correct, but it looks like
it matches the full calculation, including the VG multiple.  It would
be better to leave out that part of the encoding, since the number of
SVE vectors spilled could vary quite easily.
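
Something along these lines, maybe (untested; if I'm decoding the escape
right, 0x8f,0 is sp and 0x92,0x2e,0 is VG, with the multiple and the constant
offset following, so matching only that prefix -- and wildcarding the length
byte -- keeps the test independent of how many vectors are spilled):

/* { dg-final { scan-assembler {\.cfi_escape 0xf,0x[0-9a-f]+,0x8f,0,0x92,0x2e,0,} } } */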

> diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fd0e987597eba406fa7351433fe7157743aeca42
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-16.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target supports_stack_clash_protection } */
> +/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
> +
> +
> +#include <stdint.h>

Excess blank line before include.

> +#define N 20040
> +
> +void __attribute__ ((noinline, noclone))
> +test (int8_t *restrict dest, int8_t *restrict src)
> +{
> +  for (int i = 0; i < N; i+=8)
> +    {
> +      dest[i] += src[i * 4];
> +      dest[i+1] += src[i * 4 + 1];
> +      dest[i+2] += src[i * 4 + 2];
> +      dest[i+3] += src[i * 4 + 3];
> +      dest[i+4] += src[i * 4 + 4];
> +      dest[i+5] += src[i * 4 + 5];
> +      dest[i+6] += src[i * 4 + 6];
> +      dest[i+7] += src[i * 4 + 7];
> +    }
> +}
> +
> +
> +/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
> +/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
> +/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
> +
> +/* SVE spill, requires probing as vector size is unknown at compile time.  */

Same comments as above about forcing spilling and putting the comment before
the dg-finals.

Thanks,
Richard
