When the gold linker sees a call from -fsplit-stack code to code compiled without -fsplit-stack, it arranges for the function to call __morestack_non_split instead of __morestack. The __morestack_non_split function requests a larger stack, one that is large enough to run code which doesn't know about split stacks.
In libgo.so, many functions refer to TLS variables, which means that they are calling the function __tls_get_addr, which is in libc and is not compiled with -fsplit-stack. This means that many libgo functions always split the stack. In particular, such functions often then call other libgo functions which themselves always split the stack. This leads to the stack being split many times, far more than necessary. This libgcc patch to __morestack_non_split partially avoids this problem. Before this patch, __morestack_non_split would always simply increment the requested stack size by 0x4000 and carry on, thus always splitting the stack. This patch changes it to first see whether there is enough space on the current stack to handle the requested stack size plus 0x4000. If there is enough space, then it simply returns without splitting the stack. If there is not enough space, then it increments the requested stack size by 0x5000. The effect of this change is that the first function which calls non-split-stack code allocates a large stack, and that functions which it calls in turn can continue using the same large stack (up to a point, of course). Splitting the stack requires blocking and unblocking signals, which is a system call. Making this change reduced the number of times that the libgo net/http testsuite calls sigprocmask from 3604100 to 26597, which is about a 99% reduction. Bootstrapped and ran Go testsuite on x86_64-unknown-linux-gnu. Also ran the split-stack C tests. Committed to mainline. Ian
Index: config/i386/morestack.S =================================================================== --- config/i386/morestack.S (revision 182418) +++ config/i386/morestack.S (working copy) @@ -96,13 +96,113 @@ #endif __morestack_non_split: + .cfi_startproc #ifndef __x86_64__ - addl $0x4000,4(%esp) + + # See below for an extended explanation of the CFI instructions. + .cfi_offset 8, 8 # New PC stored at CFA + 8 + .cfi_escape 0x15, 4, 0x7d # DW_CFA_val_offset_sf, %esp, 12/-4 + # i.e., next %esp is CFA + 12 + + pushl %eax # Save %eax in case it is a parameter. + + .cfi_def_cfa %esp,8 # Account for pushed register. + + movl %esp,%eax # Current stack, + subl 8(%esp),%eax # less required stack frame size, + subl $0x4000,%eax # less space for non-split code. + cmpl %gs:0x30,%eax # See if we have enough space. + jb 2f # Get more space if we need it. + + # Here the stack is + # %esp + 20: stack pointer after two returns + # %esp + 16: return address of morestack caller's caller + # %esp + 12: size of parameters + # %esp + 8: new stack frame size + # %esp + 4: return address of this function + # %esp: saved %eax + # + # Since we aren't doing a full split stack, we don't need to + # do anything when our caller returns. So we return to our + # caller rather than calling it, and let it return as usual. + # To make that work we adjust the return address. + + # This breaks call/return address prediction for the call to + # this function. I can't figure out a way to make it work + # short of copying the parameters down the stack, which will + # probably take more clock cycles than we will lose breaking + # call/return address prediction. We will only break + # prediction for this call, not for our caller. + + movl 4(%esp),%eax # Increment the return address + cmpb $0xc3,(%eax) # to skip the ret instruction; + je 1f # see above. + addl $2,%eax +1: inc %eax + movl %eax,4(%esp) # Update return address. + + popl %eax # Restore %eax and stack. 
+ + .cfi_def_cfa %esp,4 # Account for popped register. + + ret $8 # Return to caller, popping args. + +2: + .cfi_def_cfa %esp,8 # Back to where we were. + + popl %eax # Restore %eax and stack. + + .cfi_def_cfa %esp,4 # Account for popped register. + + addl $0x5000+BACKOFF,4(%esp) # Increment space we request. + + # Fall through into morestack. + +#else + + # See below for an extended explanation of the CFI instructions. + .cfi_offset 16, 0 + .cfi_escape 0x15, 7, 0x7f # DW_CFA_val_offset_sf, %esp, 8/-8 + + pushq %rax # Save %rax in case caller is using + # it to preserve original %r10. + .cfi_def_cfa %rsp,16 # Adjust for pushed register. + + movq %rsp,%rax # Current stack, + subq %r10,%rax # less required stack frame size, + subq $0x4000,%rax # less space for non-split code. + +#ifdef __LP64__ + cmpq %fs:0x70,%rax # See if we have enough space. #else - addq $0x4000,%r10 + cmpl %fs:0x40,%eax +#endif + jb 2f # Get more space if we need it. + + # This breaks call/return prediction, as described above. + incq 8(%rsp) # Increment the return address. + + popq %rax # Restore register. + + .cfi_def_cfa %rsp,8 # Adjust for popped register. + + ret # Return to caller. + +2: + .cfi_def_cfa %rsp,16 # Back to where we were. + + popq %rax # Restore register. + + .cfi_def_cfa %rsp,8 # Adjust for popped register. + + addq $0x5000+BACKOFF,%r10 # Increment space we request. + + # Fall through into morestack. + #endif + .cfi_endproc #ifdef __ELF__ .size __morestack_non_split, . - __morestack_non_split #endif