[PHP-DEV] more inline assembler noise

Ard Biesheuvel Fri, 18 Jan 2013 10:09:46 -0800

Hello,

Again, apologies for prematurely declaring someone else's code 'crap'.There are no bugs in the inline x86 assembler in Zend/zend_operators.h,as far as I can tell, only two kinds of issues that I still think shouldbe addressed.

First of all, from a maintenance pov, having field offsets (like theoffset of zval.type) and constants (like $0x2 for IS_DOUBLE) hard codedinside the instructions is a bad idea.

The other issue is the branching and the floating point instructions.The inline assembler addresses the common case, but also adds a bunch ofinstructions that address the corner case, and some branches to jumpover them. As I indicated in my previous email, branching is relativelycostly on a modern CPU with deep pipelines and having a bunch of FPUinstructions in there that hardly ever get executed doesn't help either.

The primary reason for having inline assembler at all is the ability todetect overflow. This mainly applies to multiplication, as in that case,detecting overflow in C code is much harder compared to reading acondition flag in the CPU (hence the various accelerated implementationsin zend_signed_multiply.h). However, detecting overflow inaddition/subtraction implemented in C is much easier, as the code inzend_operators.h proves: just a matter of checking the sign bits, ordoing a simple compare with LONG_MIN/LONG_MAX.

Therefore, I would be interested in finding out which benchmark was usedto make the case for having these accelerated implementations in thefirst place. The differences in performance between variousimplementations are very small in the tests I have done.

As for the code style/maintainability, I propose to apply the attachedpatch. The performance is on par, as far as I can tell, but it isarguably better code. I will also hook in the ARM versions once I manageto prove that the performance is affected favourably by them.


Regards,
Ard.



Before
-------

$ time php -r 'for ($i = 0; $i < 0x7fffffff; $i++);'

real    0m56.910s
user    0m56.876s
sys     0m0.008s


$ time php -r 'for ($i = 0x7fffffff; $i >= 0; $i--);'

real    1m34.576s
user    1m34.518s
sys     0m0.020s


$ time php -r 'for ($i = 0; $i < 0x7fffffff; $i += 3);'

real    0m21.494s
user    0m21.473s
sys     0m0.008s


$ time php -r 'for ($i = 0x7fffffff; $i >= 0; $i -= 3);'

real    0m19.879s
user    0m19.865s
sys     0m0.004s


After
-----

$ time php -r 'for ($i = 0; $i < 0x7fffffff; $i++);'

real    0m56.687s
user    0m56.656s
sys     0m0.004s


$ time php -r 'for ($i = 0x7fffffff; $i >= 0; $i--);'

real    1m28.124s
user    1m28.082s
sys     0m0.004s


$ time php -r 'for ($i = 0; $i < 0x7fffffff; $i += 3);'

real    0m20.561s
user    0m20.545s
sys     0m0.004s


$ time php -r 'for ($i = 0x7fffffff; $i >= 0; $i -= 3);'

real    0m20.524s
user    0m20.509s
sys     0m0.004s

>From c9c8847713d22058ee47beffad54f2159f199cb9 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheu...@linaro.org>
Date: Fri, 18 Jan 2013 18:31:57 +0100
Subject: [PATCH] Improve x86 assembler implementations of add/sub int
 arithmetic

---
 Zend/zend_operators.h |  182 +++++++++++++++----------------------------------
 1 file changed, 56 insertions(+), 126 deletions(-)

diff --git a/Zend/zend_operators.h b/Zend/zend_operators.h
index 20a5277..b34ddfc 100644
--- a/Zend/zend_operators.h
+++ b/Zend/zend_operators.h
@@ -479,35 +479,27 @@ ZEND_API void zend_update_current_locale(void);
 static zend_always_inline int fast_increment_function(zval *op1)
 {
 	if (EXPECTED(Z_TYPE_P(op1) == IS_LONG)) {
+		unsigned char overflow;
 #if defined(__GNUC__) && defined(__i386__)
 		__asm__(
-			"incl (%0)\n\t"
-			"jno  0f\n\t"
-			"movl $0x0, (%0)\n\t"
-			"movl $0x41e00000, 0x4(%0)\n\t"
-			"movb $0x2,0xc(%0)\n"
-			"0:"
-			:
-			: "r"(op1));
+			"incl (%1)\n\t"
+			"seto %b0"
+			: "=q"(overflow)
+			: "r"(&Z_LVAL_P(op1)));
 #elif defined(__GNUC__) && defined(__x86_64__)
 		__asm__(
-			"incq (%0)\n\t"
-			"jno  0f\n\t"
-			"movl $0x0, (%0)\n\t"
-			"movl $0x43e00000, 0x4(%0)\n\t"
-			"movb $0x2,0x14(%0)\n"
-			"0:"
-			:
-			: "r"(op1));
+			"incq (%1)\n\t"
+			"seto %b0"
+			: "=q"(overflow)
+			: "r"(&Z_LVAL_P(op1)));
 #else
-		if (UNEXPECTED(Z_LVAL_P(op1) == LONG_MAX)) {
+		overflow = (Z_LVAL_P(op1)++ == LONG_MAX);
+#endif
+		if (UNEXPECTED(overflow)) {
 			/* switch to double */
 			Z_DVAL_P(op1) = (double)LONG_MAX + 1.0;
 			Z_TYPE_P(op1) = IS_DOUBLE;
-		} else {
-			Z_LVAL_P(op1)++;
 		}
-#endif
 		return SUCCESS;
 	}
 	return increment_function(op1);
@@ -516,35 +508,27 @@ static zend_always_inline int fast_increment_function(zval *op1)
 static zend_always_inline int fast_decrement_function(zval *op1)
 {
 	if (EXPECTED(Z_TYPE_P(op1) == IS_LONG)) {
+		unsigned char overflow;
 #if defined(__GNUC__) && defined(__i386__)
 		__asm__(
-			"decl (%0)\n\t"
-			"jno  0f\n\t"
-			"movl $0x00200000, (%0)\n\t"
-			"movl $0xc1e00000, 0x4(%0)\n\t"
-			"movb $0x2,0xc(%0)\n"
-			"0:"
-			:
-			: "r"(op1));
+			"decl (%1)\n\t"
+			"seto %b0"
+			: "=q"(overflow)
+			: "r"(&Z_LVAL_P(op1)));
 #elif defined(__GNUC__) && defined(__x86_64__)
 		__asm__(
-			"decq (%0)\n\t"
-			"jno  0f\n\t"
-			"movl $0x00000000, (%0)\n\t"
-			"movl $0xc3e00000, 0x4(%0)\n\t"
-			"movb $0x2,0x14(%0)\n"
-			"0:"
-			:
-			: "r"(op1));
+			"decq (%1)\n\t"
+			"seto %b0"
+			: "=q"(overflow)
+			: "r"(&Z_LVAL_P(op1)));
 #else
-		if (UNEXPECTED(Z_LVAL_P(op1) == LONG_MIN)) {
+		overflow = (Z_LVAL_P(op1)-- == LONG_MIN);
+#endif
+		if (UNEXPECTED(overflow)) {
 			/* switch to double */
 			Z_DVAL_P(op1) = (double)LONG_MIN - 1.0;
 			Z_TYPE_P(op1) = IS_DOUBLE;
-		} else {
-			Z_LVAL_P(op1)--;
 		}
-#endif
 		return SUCCESS;
 	}
 	return decrement_function(op1);
@@ -554,57 +538,34 @@ static zend_always_inline int fast_add_function(zval *result, zval *op1, zval *o
 {
 	if (EXPECTED(Z_TYPE_P(op1) == IS_LONG)) {
 		if (EXPECTED(Z_TYPE_P(op2) == IS_LONG)) {
+			unsigned char overflow;
 #if defined(__GNUC__) && defined(__i386__)
 		__asm__(
-			"movl	(%1), %%eax\n\t"
-			"addl   (%2), %%eax\n\t"
-			"jo     0f\n\t"     
-			"movl   %%eax, (%0)\n\t"
-			"movb   $0x1,0xc(%0)\n\t"
-			"jmp    1f\n"
-			"0:\n\t"
-			"fildl	(%1)\n\t"
-			"fildl	(%2)\n\t"
-			"faddp	%%st, %%st(1)\n\t"
-			"movb   $0x2,0xc(%0)\n\t"
-			"fstpl	(%0)\n"
-			"1:"
-			: 
-			: "r"(result),
-			  "r"(op1),
-			  "r"(op2)
-			: "eax");
+			"movl (%2), %0\n\t"
+			"addl (%3), %0\n\t"
+			"seto %b1"
+			: "=&r"(Z_LVAL_P(result)), "=q"(overflow)
+			: "r"(&Z_LVAL_P(op1)), "r"(&Z_LVAL_P(op2)));
 #elif defined(__GNUC__) && defined(__x86_64__)
 		__asm__(
-			"movq	(%1), %%rax\n\t"
-			"addq   (%2), %%rax\n\t"
-			"jo     0f\n\t"     
-			"movq   %%rax, (%0)\n\t"
-			"movb   $0x1,0x14(%0)\n\t"
-			"jmp    1f\n"
-			"0:\n\t"
-			"fildq	(%1)\n\t"
-			"fildq	(%2)\n\t"
-			"faddp	%%st, %%st(1)\n\t"
-			"movb   $0x2,0x14(%0)\n\t"
-			"fstpl	(%0)\n"
-			"1:"
-			: 
-			: "r"(result),
-			  "r"(op1),
-			  "r"(op2)
-			: "rax");
+			"movq (%2), %0\n\t"
+			"addq (%3), %0\n\t"
+			"seto %b1"
+			: "=&r"(Z_LVAL_P(result)), "=q"(overflow)
+			: "r"(&Z_LVAL_P(op1)), "r"(&Z_LVAL_P(op2)));
 #else
 			Z_LVAL_P(result) = Z_LVAL_P(op1) + Z_LVAL_P(op2);
 
-			if (UNEXPECTED((Z_LVAL_P(op1) & LONG_SIGN_MASK) == (Z_LVAL_P(op2) & LONG_SIGN_MASK)
-				&& (Z_LVAL_P(op1) & LONG_SIGN_MASK) != (Z_LVAL_P(result) & LONG_SIGN_MASK))) {
+			overflow = (Z_LVAL_P(op1) & LONG_SIGN_MASK) == (Z_LVAL_P(op2) & LONG_SIGN_MASK)
+				&& (Z_LVAL_P(op1) & LONG_SIGN_MASK) != (Z_LVAL_P(result) & LONG_SIGN_MASK);
+#endif
+
+			if (UNEXPECTED(overflow)) {
 				Z_DVAL_P(result) = (double) Z_LVAL_P(op1) + (double) Z_LVAL_P(op2);
 				Z_TYPE_P(result) = IS_DOUBLE;
 			} else {
 				Z_TYPE_P(result) = IS_LONG;
 			}
-#endif
 			return SUCCESS;
 		} else if (EXPECTED(Z_TYPE_P(op2) == IS_DOUBLE)) {
 			Z_DVAL_P(result) = ((double)Z_LVAL_P(op1)) + Z_DVAL_P(op2);
@@ -629,65 +590,34 @@ static zend_always_inline int fast_sub_function(zval *result, zval *op1, zval *o
 {
 	if (EXPECTED(Z_TYPE_P(op1) == IS_LONG)) {
 		if (EXPECTED(Z_TYPE_P(op2) == IS_LONG)) {
+			unsigned char overflow;
 #if defined(__GNUC__) && defined(__i386__)
 		__asm__(
-			"movl	(%1), %%eax\n\t"
-			"subl   (%2), %%eax\n\t"
-			"jo     0f\n\t"     
-			"movl   %%eax, (%0)\n\t"
-			"movb   $0x1,0xc(%0)\n\t"
-			"jmp    1f\n"
-			"0:\n\t"
-			"fildl	(%2)\n\t"
-			"fildl	(%1)\n\t"
-#if defined(__clang__) && (__clang_major__ < 2 || (__clang_major__ == 2 && __clang_minor__ < 10))
-			"fsubp  %%st(1), %%st\n\t"  /* LLVM bug #9164 */
-#else
-			"fsubp	%%st, %%st(1)\n\t"
-#endif
-			"movb   $0x2,0xc(%0)\n\t"
-			"fstpl	(%0)\n"
-			"1:"
-			: 
-			: "r"(result),
-			  "r"(op1),
-			  "r"(op2)
-			: "eax");
+			"movl (%2), %0\n\t"
+			"subl (%3), %0\n\t"
+			"seto %b1"
+			: "=&r"(Z_LVAL_P(result)), "=q"(overflow)
+			: "r"(&Z_LVAL_P(op1)), "r"(&Z_LVAL_P(op2)));
 #elif defined(__GNUC__) && defined(__x86_64__)
 		__asm__(
-			"movq	(%1), %%rax\n\t"
-			"subq   (%2), %%rax\n\t"
-			"jo     0f\n\t"     
-			"movq   %%rax, (%0)\n\t"
-			"movb   $0x1,0x14(%0)\n\t"
-			"jmp    1f\n"
-			"0:\n\t"
-			"fildq	(%2)\n\t"
-			"fildq	(%1)\n\t"
-#if defined(__clang__) && (__clang_major__ < 2 || (__clang_major__ == 2 && __clang_minor__ < 10))
-			"fsubp  %%st(1), %%st\n\t"  /* LLVM bug #9164 */
-#else
-			"fsubp	%%st, %%st(1)\n\t"
-#endif
-			"movb   $0x2,0x14(%0)\n\t"
-			"fstpl	(%0)\n"
-			"1:"
-			: 
-			: "r"(result),
-			  "r"(op1),
-			  "r"(op2)
-			: "rax");
+			"movq (%2), %0\n\t"
+			"subq (%3), %0\n\t"
+			"seto %b1"
+			: "=&r"(Z_LVAL_P(result)), "=q"(overflow)
+			: "r"(&Z_LVAL_P(op1)), "r"(&Z_LVAL_P(op2)));
 #else
 			Z_LVAL_P(result) = Z_LVAL_P(op1) - Z_LVAL_P(op2);
 
-			if (UNEXPECTED((Z_LVAL_P(op1) & LONG_SIGN_MASK) != (Z_LVAL_P(op2) & LONG_SIGN_MASK)
-				&& (Z_LVAL_P(op1) & LONG_SIGN_MASK) != (Z_LVAL_P(result) & LONG_SIGN_MASK))) {
+			overflow = (Z_LVAL_P(op1) & LONG_SIGN_MASK) != (Z_LVAL_P(op2) & LONG_SIGN_MASK)
+				&& (Z_LVAL_P(op1) & LONG_SIGN_MASK) != (Z_LVAL_P(result) & LONG_SIGN_MASK);
+#endif
+
+			if (UNEXPECTED(overflow)) {
 				Z_DVAL_P(result) = (double) Z_LVAL_P(op1) - (double) Z_LVAL_P(op2);
 				Z_TYPE_P(result) = IS_DOUBLE;
 			} else {
 				Z_TYPE_P(result) = IS_LONG;
 			}
-#endif
 			return SUCCESS;
 		} else if (EXPECTED(Z_TYPE_P(op2) == IS_DOUBLE)) {
 			Z_DVAL_P(result) = ((double)Z_LVAL_P(op1)) - Z_DVAL_P(op2);
-- 
1.7.10.4

-- 
PHP Internals - PHP Runtime Development Mailing List
To unsubscribe, visit: http://www.php.net/unsub.php

[PHP-DEV] more inline assembler noise

Reply via email to