Hi,

Sorry for the delay - I finally had a chance to look at this again. 
I'll start with alloca:

@@ -15245,6 +15455,28 @@ aarch64_sched_can_speculate_insn (rtx_insn *insn)
     }
 }
 
+/* It has been decided that to allow up to 1kb of outgoing argument
+   space to be allocated w/o probing.  If more than 1kb of outgoing
+   argment space is allocated, then it must be probed and the last
+   probe must occur no more than 1kbyte away from the end of the
+   allocated space.
+
+   This implies that the residual part of an alloca allocation may
+   need probing in cases where the generic code might not otherwise
+   think a probe is needed.
+
+   This target hook returns TRUE when allocating RESIDUAL bytes of
+   alloca space requires an additional probe, otherwise FALSE is
+   returned.  */
+
+static bool
+aarch64_stack_clash_protection_final_dynamic_probe (rtx residual)
+{
+  return (residual == CONST0_RTX (Pmode)
+         || GET_CODE (residual) != CONST_INT
+         || INTVAL (residual) >= 1024);
+}
+

The const0 check is wrong - for alloca(0) we do not want to insert a probe!

I don't get how this works - probes are done at strange offsets, there is always
a final probe (even if residual is < 1024), and the alignment info seems to be
lost somehow, so the generated code ends up quite bad:

void *p;
void small_alloca (int x)
{
  if (x > 100)
   p = __builtin_alloca (4096);
}

        sub     sp, sp, #4096
        str     xzr, [sp, 4088]
        mov     x0, sp
        str     xzr, [x0], 15      *** +15 and extra probe
        and     x0, x0, -16        *** already aligned

The same unnecessary probe happens for a variable alloca after the loop:

void alloca (int x)
{
  if (x > 100)
   p = __builtin_alloca (x);
}

        add     x0, x0, 15
        and     x0, x0, -16
        and     x1, x0, -4096
        sub     x1, sp, x1
.L22:
        mov     x2, sp
        cmp     x2, x1
        beq     .L23
        sub     sp, sp, #4096
        str     xzr, [sp, 4088]
        b       .L22
.L23:
        and     x0, x0, 4095
        sub     x1, x0, #8
        sub     sp, sp, x0
        str     xzr, [sp, x1]
        str     xzr, [sp, -8]
        mov     x1, sp


That does lots of unnecessary operations, and always includes an extra probe
(which isn't needed since the final adjustment is always less than the max
probe distance). I think it should be:

        add     x0, x0, 15
        subs x1, x0, 4096
        bhi .L22
.L23:
        and     x0, x0, 4080
        sub     sp, sp, x0
        str     xzr, [sp, probe_offset - 8]
        mov     x1, sp


.L22: (loop marked as unlikely so out of the way)
        subs x1, x1, 4096
        sub     sp, sp, #4096
        str     xzr, [sp, probe_offset]
        bhi .L22
        b .L23

On AArch64 probe_offset would be min (1024, outgoing_args_size). On other
targets it could be 0 or (probe distance - 8) depending on the probe design.
This means you always do exactly the number of probes that are needed and avoid
the corner case of alloca (0). Typically the only extra executed instructions
are a compare+non-taken branch plus a store. Codesize overhead is reduced by a
third (from 12 to 8 instructions).

Wilco    

Reply via email to