On Wed, May 25, 2016 at 02:30:54PM +0100, Dominik Vogt wrote: > On Tue, May 03, 2016 at 03:17:53PM +0100, Dominik Vogt wrote: > > Version two of the patch including a test case. > > > > On Mon, May 02, 2016 at 09:10:25AM -0600, Jeff Law wrote: > > > On 04/29/2016 04:12 PM, Dominik Vogt wrote: > > > >The attached patch removes excess stack space allocation with > > > >alloca in some situations. Plese check the commit message in the > > > >patch for details. > > > > > However, I would strongly recommend some tests, even if they are > > > target specific. You can always copy pr36728-1 into the s390x > > > directory and look at size of the generated stack. Simliarly for > > > pr50938 for x86. > > > > However, x86 uses the "else" branch in round_push, i.e. it uses > > "virtual_preferred_stack_boundary_rtx" to calculate the number of > > bytes to add for stack alignment. That value is unknown at the > > time round_push is called, so the test case fails on such targets, > > and I've no idea how to fix this properly. > > Third version of the patch with the suggested cleanup in the first > patch and the functional stuff in the second one. The first patch > is based on Jeff's draft with the change suggested by Eric and > more cleanup added by me.
This is the updated funtional patch. Re-tested with limited effort, i.e. tested and bootstrapped on s390x biarch (but did not look for performance regressions compared to version 2 of the patch). Ciao Dominik ^_^ ^_^ -- Dominik Vogt IBM Germany
gcc/ChangeLog * explow.c (round_push): Use know adjustment. (allocate_dynamic_stack_space): Pass known adjustment to round_push. gcc/testsuite/ChangeLog * gcc.dg/pr50938.c: New test.
>From 4296d353e1d153b5b5ee435a44cae6117bf2fff0 Mon Sep 17 00:00:00 2001 From: Dominik Vogt <v...@linux.vnet.ibm.com> Date: Fri, 29 Apr 2016 08:36:59 +0100 Subject: [PATCH 2/2] Drop excess size used for run time allocated stack variables. The present calculation sometimes led to more stack memory being used than necessary with alloca. First, (STACK_BOUNDARY -1) would be added to the allocated size: size = plus_constant (Pmode, size, extra); size = force_operand (size, NULL_RTX); Then round_push was called and added another (STACK_BOUNDARY - 1) before rounding down to a multiple of STACK_BOUNDARY. On s390x this resulted in adding 14 before rounding down for "x" in the test case pr36728-1.c. round_push() now takes an argument to inform it about what has already been added to size. --- gcc/explow.c | 45 +++++++++++++++++++++--------------- gcc/testsuite/gcc.dg/pr50938.c | 52 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr50938.c diff --git a/gcc/explow.c b/gcc/explow.c index 09a0330..85596e2 100644 --- a/gcc/explow.c +++ b/gcc/explow.c @@ -949,24 +949,30 @@ anti_adjust_stack (rtx adjust) } /* Round the size of a block to be pushed up to the boundary required - by this machine. SIZE is the desired size, which need not be constant. */ + by this machine. SIZE is the desired size, which need not be constant. + ALREADY_ADDED is the number of units that have already been added to SIZE + for other alignment reasons. +*/ static rtx -round_push (rtx size) +round_push (rtx size, int already_added) { - rtx align_rtx, alignm1_rtx; + rtx align_rtx, add_rtx; if (!SUPPORTS_STACK_ALIGNMENT || crtl->preferred_stack_boundary == MAX_SUPPORTED_STACK_ALIGNMENT) { int align = crtl->preferred_stack_boundary / BITS_PER_UNIT; + int add; if (align == 1) return size; + add = (align > already_added) ? align - already_added - 1 : 0; + if (CONST_INT_P (size)) { - HOST_WIDE_INT new_size = (INTVAL (size) + align - 1) / align * align; + HOST_WIDE_INT new_size = (INTVAL (size) + add) / align * align; if (INTVAL (size) != new_size) size = GEN_INT (new_size); @@ -974,7 +980,7 @@ round_push (rtx size) } align_rtx = GEN_INT (align); - alignm1_rtx = GEN_INT (align - 1); + add_rtx = (add > 0) ? GEN_INT (add) : const0_rtx; } else { @@ -983,15 +989,15 @@ round_push (rtx size) substituted by the right value in vregs pass and optimized during combine. */ align_rtx = virtual_preferred_stack_boundary_rtx; - alignm1_rtx = force_operand (plus_constant (Pmode, align_rtx, -1), - NULL_RTX); + add_rtx = force_operand (plus_constant (Pmode, align_rtx, -1), NULL_RTX); } /* CEIL_DIV_EXPR needs to worry about the addition overflowing, but we know it can't. So add ourselves and then do TRUNC_DIV_EXPR. */ - size = expand_binop (Pmode, add_optab, size, alignm1_rtx, - NULL_RTX, 1, OPTAB_LIB_WIDEN); + if (add_rtx != const0_rtx) + size = expand_binop (Pmode, add_optab, size, add_rtx, + NULL_RTX, 1, OPTAB_LIB_WIDEN); size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx, NULL_RTX, 1); size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1); @@ -1174,7 +1180,7 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align, HOST_WIDE_INT stack_usage_size = -1; rtx_code_label *final_label; rtx final_target, target; - unsigned extra; + unsigned extra = 0; /* If we're asking for zero bytes, it doesn't matter what we point to since we can't dereference it. But return a reasonable @@ -1251,15 +1257,18 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align, example), so we must preventively align the value. We leave space in SIZE for the hole that might result from the alignment operation. */ - extra = (required_align - BITS_PER_UNIT) / BITS_PER_UNIT; - size = plus_constant (Pmode, size, extra); - size = force_operand (size, NULL_RTX); + if (required_align > BITS_PER_UNIT) + { + extra = (required_align - BITS_PER_UNIT) / BITS_PER_UNIT; + size = plus_constant (Pmode, size, extra); + size = force_operand (size, NULL_RTX); - if (flag_stack_usage_info) - stack_usage_size += extra; + if (flag_stack_usage_info) + stack_usage_size += extra; - if (extra && size_align > BITS_PER_UNIT) - size_align = BITS_PER_UNIT; + if (size_align > BITS_PER_UNIT) + size_align = BITS_PER_UNIT; + } /* Round the size to a multiple of the required stack alignment. Since the stack if presumed to be rounded before this allocation, @@ -1276,7 +1285,7 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align, momentarily mis-aligning the stack. */ if (size_align % MAX_SUPPORTED_STACK_ALIGNMENT != 0) { - size = round_push (size); + size = round_push (size, extra); if (flag_stack_usage_info) { diff --git a/gcc/testsuite/gcc.dg/pr50938.c b/gcc/testsuite/gcc.dg/pr50938.c new file mode 100644 index 0000000..14b7540 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr50938.c @@ -0,0 +1,52 @@ +/* PR/50938: Check that alloca () reserves the correct amount of stack space. + */ + +/* { dg-do run } */ +/* { dg-options "-O0" } */ +/* { dg-require-effective-target alloca } */ + +__attribute__ ((noinline)) +static void +dummy (void) +{ +} + +__attribute__ ((noinline)) +static char * +get_frame_addr (void *p) +{ + void *faddr = __builtin_frame_address (0); + /* Call a function to make sure that a stack frame exists. */ + dummy (); + return faddr; +} + +__attribute__ ((noinline)) +static void * +stack_var (void) +{ + /* 1024 bytes on the stack. */ + char s[1024]; + /* One stack slot. */ + void *p = (void *)s; + /* Keep stack memory alive by passing it to the function. */ + return get_frame_addr (p); +} + +__attribute__ ((noinline)) +static void * +alloca_var (void) +{ + /* 1024 bytes on the stack plus one stack slot for a. */ + void *a = __builtin_alloca (1024); + return get_frame_addr (a); +} + +int main (void) +{ + if (stack_var () != alloca_var ()) + /* The functions used a different amount of stack space. */ + __builtin_abort (); + + return 0; +} -- 2.3.0