Hi,

I've used your patch as the base and applied my changes on top.  The
attached patch is the result, so it is supposed to replace your
version.  It now also supports emitting a runtime loop.
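
For reference, the loop allocates the frame in PROBE_INTERVAL sized
chunks and touches each chunk so that a guard page between the old and
the new stack pointer cannot be skipped.  A rough C-level sketch of the
scheme (purely illustrative, not the emitted RTL; the function and
parameter names are made up):

  /* Allocate SIZE bytes below *SP, probing every PROBE_INTERVAL bytes
     so that no PROBE_INTERVAL sized range stays untouched.  The
     residual bytes are allocated without an extra probe here.  */
  static void
  sketch_allocate_probed (volatile char **sp, long size, long probe_interval)
  {
    long rounded = size & -probe_interval;

    for (long done = 0; done < rounded; done += probe_interval)
      {
        *sp -= probe_interval;
        /* Touch a byte near the top of the freshly allocated chunk.  */
        (*sp)[probe_interval - sizeof (long)] = 0;
      }

    *sp -= size - rounded;
  }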

It bootstraps fine, but unfortunately I see Ada regressions which I
haven't tracked down yet:

> FAIL: cb1010a
> FAIL: gnat.dg/stack_check1.adb execution test
> FAIL: gnat.dg/stack_check1.adb execution test

Bye,

-Andreas-

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index bbae89b..796ca76 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -11040,6 +11040,179 @@ pass_s390_early_mach::execute (function *fun)
 
 } // anon namespace
 
+/* Calculate TARGET = REG + OFFSET as s390_emit_prologue would do it.
+   - push too big immediates to the literal pool and annotate the refs
+   - emit frame related notes for stack pointer changes.  */
+
+static rtx
+s390_prologue_plus_offset (rtx target, rtx reg, rtx offset, bool frame_related_p)
+{
+  rtx insn;
+  rtx orig_offset = offset;
+
+  gcc_assert (REG_P (target));
+  gcc_assert (REG_P (reg));
+  gcc_assert (CONST_INT_P (offset));
+
+  if (offset == const0_rtx)                               /* lr/lgr */
+    {
+      insn = emit_move_insn (target, reg);
+    }
+  else if (DISP_IN_RANGE (INTVAL (offset)))               /* la */
+    {
+      insn = emit_move_insn (target, gen_rtx_PLUS (Pmode, reg,
+                                                  offset));
+    }
+  else
+    {
+      if (!satisfies_constraint_K (offset)                /* ahi/aghi */
+         && (!TARGET_EXTIMM
+             || (!satisfies_constraint_Op (offset)       /* alfi/algfi */
+                 && !satisfies_constraint_On (offset)))) /* slfi/slgfi */
+       offset = force_const_mem (Pmode, offset);
+
+      if (target != reg)
+       {
+         insn = emit_move_insn (target, reg);
+         RTX_FRAME_RELATED_P (insn) = frame_related_p ? 1 : 0;
+       }
+
+      insn = emit_insn (gen_add2_insn (target, offset));
+
+      if (!CONST_INT_P (offset))
+       {
+         annotate_constant_pool_refs (&PATTERN (insn));
+
+         if (frame_related_p)
+           add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+                         gen_rtx_SET (target,
+                                      gen_rtx_PLUS (Pmode, target,
+                                                    orig_offset)));
+       }
+    }
+
+  RTX_FRAME_RELATED_P (insn) = frame_related_p ? 1 : 0;
+
+  return insn;
+}
+
+/* Emit a compare instruction with a volatile memory access as stack
+   probe.  It does not waste store tags and does not clobber any
+   registers apart from the condition code.  */
+static void
+s390_emit_stack_probe (rtx addr)
+{
+  rtx tmp = gen_rtx_MEM (Pmode, addr);
+  MEM_VOLATILE_P (tmp) = 1;
+  s390_emit_compare (EQ, gen_rtx_REG (Pmode, 0), tmp);
+}
+
+#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
+#if (PROBE_INTERVAL - 4 > 4095)
+#error "S/390: stack probe offset must fit into short displacement."
+#endif
+
+/* Use a runtime loop if we have to emit this many probes or more.  */
+#define MIN_UNROLL_PROBES 3
+
+/* Allocate SIZE bytes of stack space, using TEMP_REG as a temporary
+   if necessary.  LAST_PROBE_OFFSET contains the offset of the closest
+   probe relative to the stack pointer.
+
+   Note that SIZE is negative.
+
+   The return value is true if TEMP_REG has been clobbered.  */
+static bool
+allocate_stack_space (rtx size, HOST_WIDE_INT last_probe_offset,
+                     rtx temp_reg)
+{
+  bool temp_reg_clobbered_p = false;
+
+  if (flag_stack_clash_protection)
+    {
+      if (last_probe_offset + -INTVAL (size) < PROBE_INTERVAL)
+       dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+      else
+       {
+         rtx offset = GEN_INT (PROBE_INTERVAL - UNITS_PER_LONG);
+         HOST_WIDE_INT rounded_size = -INTVAL (size) & -PROBE_INTERVAL;
+         HOST_WIDE_INT num_probes = rounded_size / PROBE_INTERVAL;
+         HOST_WIDE_INT residual = -INTVAL (size) - rounded_size;
+
+         if (num_probes < MIN_UNROLL_PROBES)
+           {
+             /* Emit unrolled probe statements.  */
+
+             for (unsigned int i = 0; i < num_probes; i++)
+               {
+                 s390_prologue_plus_offset (stack_pointer_rtx,
+                                            stack_pointer_rtx,
+                                            GEN_INT (-PROBE_INTERVAL), true);
+                 s390_emit_stack_probe (gen_rtx_PLUS (Pmode,
+                                                      stack_pointer_rtx,
+                                                      offset));
+               }
+             dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
+           }
+         else
+           {
+             /* Emit a loop probing the pages.  */
+
+             rtx_code_label *loop_start_label = gen_label_rtx ();
+
+             /* From now on temp_reg will be the CFA register.  */
+             s390_prologue_plus_offset (temp_reg, stack_pointer_rtx,
+                                        GEN_INT (-rounded_size), true);
+             emit_label (loop_start_label);
+
+             s390_prologue_plus_offset (stack_pointer_rtx,
+                                        stack_pointer_rtx,
+                                        GEN_INT (-PROBE_INTERVAL), false);
+             s390_emit_stack_probe (gen_rtx_PLUS (Pmode,
+                                                  stack_pointer_rtx,
+                                                  offset));
+             emit_cmp_and_jump_insns (stack_pointer_rtx, temp_reg,
+                                      GT, NULL_RTX,
+                                      Pmode, 1, loop_start_label);
+
+             /* Without this make_edges ICEs.  */
+             JUMP_LABEL (get_last_insn ()) = loop_start_label;
+             LABEL_NUSES (loop_start_label) = 1;
+
+             /* That's going to be a NOP since stack pointer and
+                temp_reg are supposed to be the same here.  We just
+                emit it to set the CFA reg back to r15.  */
+             s390_prologue_plus_offset (stack_pointer_rtx, temp_reg,
+                                        const0_rtx, true);
+             temp_reg_clobbered_p = true;
+             dump_stack_clash_frame_info (PROBE_LOOP, residual != 0);
+           }
+
+         /* Handle any residual allocation request.  */
+         s390_prologue_plus_offset (stack_pointer_rtx,
+                                    stack_pointer_rtx,
+                                    GEN_INT (-residual), true);
+         last_probe_offset += residual;
+         if (last_probe_offset >= PROBE_INTERVAL)
+           s390_emit_stack_probe (gen_rtx_PLUS (Pmode,
+                                                stack_pointer_rtx,
+                                                GEN_INT (residual
+                                                         - UNITS_PER_LONG)));
+
+         emit_insn (gen_blockage ());
+
+         return temp_reg_clobbered_p;
+       }
+    }
+
+  /* Subtract frame size from stack pointer.  */
+  s390_prologue_plus_offset (stack_pointer_rtx,
+                            stack_pointer_rtx,
+                            size, true);
+
+  return temp_reg_clobbered_p;
+}
+
 /* Expand the prologue into a bunch of separate insns.  */
 
 void
@@ -11064,6 +11237,17 @@ s390_emit_prologue (void)
   else
     temp_reg = gen_rtx_REG (Pmode, 1);
 
+  /* When probing for stack-clash mitigation, we have to track the distance
+     between the stack pointer and closest known reference.
+
+     Most of the time we have to make a worst case assumption.  The
+     only exception is when TARGET_BACKCHAIN is active, in which case
+     we know *sp (offset 0) was written.  */
+  HOST_WIDE_INT last_probe_offset
+    = (TARGET_BACKCHAIN
+       ? (TARGET_PACKED_STACK ? STACK_POINTER_OFFSET - UNITS_PER_LONG : 0)
+       : PROBE_INTERVAL - (STACK_BOUNDARY / UNITS_PER_WORD));
+
   s390_save_gprs_to_fprs ();
 
   /* Save call saved gprs.  */
@@ -11075,6 +11259,14 @@ s390_emit_prologue (void)
                                           - cfun_frame_layout.first_save_gpr_slot),
                        cfun_frame_layout.first_save_gpr,
                        cfun_frame_layout.last_save_gpr);
+
+      /* This is not 100% correct.  If we have more than one register saved,
+        then LAST_PROBE_OFFSET can move even closer to sp.  */
+      last_probe_offset
+       = (cfun_frame_layout.gprs_offset +
+          UNITS_PER_LONG * (cfun_frame_layout.first_save_gpr
+                            - cfun_frame_layout.first_save_gpr_slot));
+
       emit_insn (insn);
     }
 
@@ -11091,6 +11283,8 @@ s390_emit_prologue (void)
       if (cfun_fpr_save_p (i))
        {
          save_fpr (stack_pointer_rtx, offset, i);
+         if (offset < last_probe_offset)
+           last_probe_offset = offset;
          offset += 8;
        }
       else if (!TARGET_PACKED_STACK || cfun->stdarg)
@@ -11104,6 +11298,8 @@ s390_emit_prologue (void)
       if (cfun_fpr_save_p (i))
        {
          insn = save_fpr (stack_pointer_rtx, offset, i);
+         if (offset < last_probe_offset)
+           last_probe_offset = offset;
          offset += 8;
 
          /* If f4 and f6 are call clobbered they are saved due to
@@ -11126,6 +11322,8 @@ s390_emit_prologue (void)
        if (cfun_fpr_save_p (i))
          {
            insn = save_fpr (stack_pointer_rtx, offset, i);
+           if (offset < last_probe_offset)
+             last_probe_offset = offset;
 
            RTX_FRAME_RELATED_P (insn) = 1;
            offset -= 8;
@@ -11145,10 +11343,11 @@ s390_emit_prologue (void)
   if (cfun_frame_layout.frame_size > 0)
     {
       rtx frame_off = GEN_INT (-cfun_frame_layout.frame_size);
-      rtx real_frame_off;
+      rtx_insn *stack_pointer_backup_loc;
+      bool temp_reg_clobbered_p;
 
       if (s390_stack_size)
-       {
+       {
          HOST_WIDE_INT stack_guard;
 
          if (s390_stack_guard)
@@ -11214,35 +11413,36 @@ s390_emit_prologue (void)
       if (s390_warn_dynamicstack_p && cfun->calls_alloca)
        warning (0, "%qs uses dynamic stack allocation", current_function_name 
());
 
-      /* Save incoming stack pointer into temp reg.  */
-      if (TARGET_BACKCHAIN || next_fpr)
-       insn = emit_insn (gen_move_insn (temp_reg, stack_pointer_rtx));
+      /* Save the location where we could backup the incoming stack
+        pointer.  */
+      stack_pointer_backup_loc = get_last_insn ();
 
-      /* Subtract frame size from stack pointer.  */
+      temp_reg_clobbered_p = allocate_stack_space (frame_off, last_probe_offset,
+                                                  temp_reg);
 
-      if (DISP_IN_RANGE (INTVAL (frame_off)))
-       {
-         insn = gen_rtx_SET (stack_pointer_rtx,
-                             gen_rtx_PLUS (Pmode, stack_pointer_rtx,
-                                           frame_off));
-         insn = emit_insn (insn);
-       }
-      else
+      if (TARGET_BACKCHAIN || next_fpr)
        {
-         if (!CONST_OK_FOR_K (INTVAL (frame_off)))
-           frame_off = force_const_mem (Pmode, frame_off);
-
-          insn = emit_insn (gen_add2_insn (stack_pointer_rtx, frame_off));
-         annotate_constant_pool_refs (&PATTERN (insn));
+         if (temp_reg_clobbered_p)
+           {
+             /* allocate_stack_space had to make use of temp_reg and
+                we need it to hold a backup of the incoming stack
+                pointer.  Recompute that value from the current
+                stack pointer.  */
+             s390_prologue_plus_offset (temp_reg, stack_pointer_rtx,
+                                        GEN_INT (cfun_frame_layout.frame_size),
+                                        false);
+           }
+         else
+           {
+             /* allocate_stack_space didn't actually require
+                temp_reg.  Insert the stack pointer backup insn
+                before the stack pointer decrement code - knowing now
+                that the value will survive.  */
+             emit_insn_after (gen_move_insn (temp_reg, stack_pointer_rtx),
+                              stack_pointer_backup_loc);
+           }
        }
 
-      RTX_FRAME_RELATED_P (insn) = 1;
-      real_frame_off = GEN_INT (-cfun_frame_layout.frame_size);
-      add_reg_note (insn, REG_FRAME_RELATED_EXPR,
-                   gen_rtx_SET (stack_pointer_rtx,
-                                gen_rtx_PLUS (Pmode, stack_pointer_rtx,
-                                              real_frame_off)));
-
       /* Set backchain.  */
 
       if (TARGET_BACKCHAIN)
@@ -11266,6 +11466,8 @@ s390_emit_prologue (void)
          emit_clobber (addr);
        }
     }
+  else if (flag_stack_clash_protection)
+    dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
 
   /* Save fprs 8 - 15 (64 bit ABI).  */
 
diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h
index 7847047..8169624 100644
--- a/gcc/config/s390/s390.h
+++ b/gcc/config/s390/s390.h
@@ -1124,4 +1124,5 @@ extern const int processor_flags_table[];
     s390_register_target_pragmas ();           \
   } while (0)
 
+#define STACK_CHECK_STATIC_BUILTIN 1
 #endif /* S390_H */
diff --git a/gcc/testsuite/gcc.dg/stack-check-5.c b/gcc/testsuite/gcc.dg/stack-check-5.c
index 2ef6a19..4306de4 100644
--- a/gcc/testsuite/gcc.dg/stack-check-5.c
+++ b/gcc/testsuite/gcc.dg/stack-check-5.c
@@ -3,6 +3,10 @@
 /* { dg-require-effective-target supports_stack_clash_protection } */
 
 
+/* Otherwise the S/390 back-end might save the stack pointer in f2 ()
+   into an FPR.  */
+/* { dg-additional-options "-msoft-float" { target { s390x-*-* } } } */
+
 extern void foo (char *);
 extern void bar (void);
 
