On Wed, May 25, 2016 at 02:30:54PM +0100, Dominik Vogt wrote:
> On Tue, May 03, 2016 at 03:17:53PM +0100, Dominik Vogt wrote:
> > Version two of the patch including a test case.
> > 
> > On Mon, May 02, 2016 at 09:10:25AM -0600, Jeff Law wrote:
> > > On 04/29/2016 04:12 PM, Dominik Vogt wrote:
> > > >The attached patch removes excess stack space allocation with
> > > >alloca in some situations.  Plese check the commit message in the
> > > >patch for details.
> > 
> > > However, I would strongly recommend some tests, even if they are
> > > target specific.  You can always copy pr36728-1 into the s390x
> > > directory and look at size of the generated stack.  Simliarly for
> > > pr50938 for x86.
> > 
> > However, x86 uses the "else" branch in round_push, i.e. it uses
> > "virtual_preferred_stack_boundary_rtx" to calculate the number of
> > bytes to add for stack alignment.  That value is unknown at the
> > time round_push is called, so the test case fails on such targets,
> > and I've no idea how to fix this properly.
> 
> Third version of the patch with the suggested cleanup in the first
> patch and the functional stuff in the second one.  The first patch
> is based on Jeff's draft with the change suggested by Eric and
> more cleanup added by me.

This is the updated funtional patch.  Re-tested with limited
effort, i.e. tested and bootstrapped on s390x biarch (but did not
look for performance regressions compared to version 2 of the
patch).

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany
gcc/ChangeLog

        * explow.c (round_push): Use know adjustment.
        (allocate_dynamic_stack_space): Pass known adjustment to round_push.
gcc/testsuite/ChangeLog

        * gcc.dg/pr50938.c: New test.
>From 4296d353e1d153b5b5ee435a44cae6117bf2fff0 Mon Sep 17 00:00:00 2001
From: Dominik Vogt <v...@linux.vnet.ibm.com>
Date: Fri, 29 Apr 2016 08:36:59 +0100
Subject: [PATCH 2/2] Drop excess size used for run time allocated stack
 variables.

The present calculation sometimes led to more stack memory being used than
necessary with alloca.  First, (STACK_BOUNDARY -1) would be added to the
allocated size:

  size = plus_constant (Pmode, size, extra);
  size = force_operand (size, NULL_RTX);

Then round_push was called and added another (STACK_BOUNDARY - 1) before
rounding down to a multiple of STACK_BOUNDARY.  On s390x this resulted in
adding 14 before rounding down for "x" in the test case pr36728-1.c.

round_push() now takes an argument to inform it about what has already been
added to size.
---
 gcc/explow.c                   | 45 +++++++++++++++++++++---------------
 gcc/testsuite/gcc.dg/pr50938.c | 52 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr50938.c

diff --git a/gcc/explow.c b/gcc/explow.c
index 09a0330..85596e2 100644
--- a/gcc/explow.c
+++ b/gcc/explow.c
@@ -949,24 +949,30 @@ anti_adjust_stack (rtx adjust)
 }
 
 /* Round the size of a block to be pushed up to the boundary required
-   by this machine.  SIZE is the desired size, which need not be constant.  */
+   by this machine.  SIZE is the desired size, which need not be constant.
+   ALREADY_ADDED is the number of units that have already been added to SIZE
+   for other alignment reasons.
+*/
 
 static rtx
-round_push (rtx size)
+round_push (rtx size, int already_added)
 {
-  rtx align_rtx, alignm1_rtx;
+  rtx align_rtx, add_rtx;
 
   if (!SUPPORTS_STACK_ALIGNMENT
       || crtl->preferred_stack_boundary == MAX_SUPPORTED_STACK_ALIGNMENT)
     {
       int align = crtl->preferred_stack_boundary / BITS_PER_UNIT;
+      int add;
 
       if (align == 1)
 	return size;
 
+      add = (align > already_added) ? align - already_added - 1 : 0;
+
       if (CONST_INT_P (size))
 	{
-	  HOST_WIDE_INT new_size = (INTVAL (size) + align - 1) / align * align;
+	  HOST_WIDE_INT new_size = (INTVAL (size) + add) / align * align;
 
 	  if (INTVAL (size) != new_size)
 	    size = GEN_INT (new_size);
@@ -974,7 +980,7 @@ round_push (rtx size)
 	}
 
       align_rtx = GEN_INT (align);
-      alignm1_rtx = GEN_INT (align - 1);
+      add_rtx = (add > 0) ? GEN_INT (add) : const0_rtx;
     }
   else
     {
@@ -983,15 +989,15 @@ round_push (rtx size)
 	 substituted by the right value in vregs pass and optimized
 	 during combine.  */
       align_rtx = virtual_preferred_stack_boundary_rtx;
-      alignm1_rtx = force_operand (plus_constant (Pmode, align_rtx, -1),
-				   NULL_RTX);
+      add_rtx = force_operand (plus_constant (Pmode, align_rtx, -1), NULL_RTX);
     }
 
   /* CEIL_DIV_EXPR needs to worry about the addition overflowing,
      but we know it can't.  So add ourselves and then do
      TRUNC_DIV_EXPR.  */
-  size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
-		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
+  if (add_rtx != const0_rtx)
+    size = expand_binop (Pmode, add_optab, size, add_rtx,
+			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
   size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
@@ -1174,7 +1180,7 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
   HOST_WIDE_INT stack_usage_size = -1;
   rtx_code_label *final_label;
   rtx final_target, target;
-  unsigned extra;
+  unsigned extra = 0;
 
   /* If we're asking for zero bytes, it doesn't matter what we point
      to since we can't dereference it.  But return a reasonable
@@ -1251,15 +1257,18 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
      example), so we must preventively align the value.  We leave space
      in SIZE for the hole that might result from the alignment operation.  */
 
-  extra = (required_align - BITS_PER_UNIT) / BITS_PER_UNIT;
-  size = plus_constant (Pmode, size, extra);
-  size = force_operand (size, NULL_RTX);
+  if (required_align > BITS_PER_UNIT)
+    {
+      extra = (required_align - BITS_PER_UNIT) / BITS_PER_UNIT;
+      size = plus_constant (Pmode, size, extra);
+      size = force_operand (size, NULL_RTX);
 
-  if (flag_stack_usage_info)
-    stack_usage_size += extra;
+      if (flag_stack_usage_info)
+	stack_usage_size += extra;
 
-  if (extra && size_align > BITS_PER_UNIT)
-    size_align = BITS_PER_UNIT;
+      if (size_align > BITS_PER_UNIT)
+	size_align = BITS_PER_UNIT;
+    }
 
   /* Round the size to a multiple of the required stack alignment.
      Since the stack if presumed to be rounded before this allocation,
@@ -1276,7 +1285,7 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
      momentarily mis-aligning the stack.  */
   if (size_align % MAX_SUPPORTED_STACK_ALIGNMENT != 0)
     {
-      size = round_push (size);
+      size = round_push (size, extra);
 
       if (flag_stack_usage_info)
 	{
diff --git a/gcc/testsuite/gcc.dg/pr50938.c b/gcc/testsuite/gcc.dg/pr50938.c
new file mode 100644
index 0000000..14b7540
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr50938.c
@@ -0,0 +1,52 @@
+/* PR/50938: Check that alloca () reserves the correct amount of stack space.
+ */
+
+/* { dg-do run } */
+/* { dg-options "-O0" } */
+/* { dg-require-effective-target alloca } */
+
+__attribute__ ((noinline))
+static void
+dummy (void)
+{
+}
+
+__attribute__ ((noinline))
+static char *
+get_frame_addr (void *p)
+{
+  void *faddr = __builtin_frame_address (0);
+  /* Call a function to make sure that a stack frame exists.  */
+  dummy ();
+  return faddr;
+}
+
+__attribute__ ((noinline))
+static void *
+stack_var (void)
+{
+  /* 1024 bytes on the stack.  */
+  char s[1024];
+  /* One stack slot.  */
+  void *p = (void *)s;
+  /* Keep stack memory alive by passing it to the function.  */
+  return get_frame_addr (p);
+}
+
+__attribute__ ((noinline))
+static void *
+alloca_var (void)
+{
+  /* 1024 bytes on the stack plus one stack slot for a.  */
+  void *a = __builtin_alloca (1024);
+  return get_frame_addr (a);
+}
+
+int main (void)
+{
+  if (stack_var () != alloca_var ())
+    /* The functions used a different amount of stack space.  */
+    __builtin_abort ();
+
+  return 0;
+}
-- 
2.3.0

Reply via email to