mode) if either src or dst address is not 8-byte aligned. It can be
optimized if both addresses have the same offset from an 8-byte boundary:
memcmp() can first align the src/dst addresses to 8 bytes and then
compare in .Llong mode.
This patch optimizes memcmp() behavior in this situation.
Test result:
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
50.715169506 seconds time elapsed
( +- 0.04% )
- with patch
28.906602373 seconds time elapsed
( +- 0.02% )
-> There is a ~75% improvement.
(2) 32 bytes
To observe the performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with the following:
-------
#include <string.h>
#include "utils.h"
-#define SIZE 256
+#define SIZE 32
#define ITERATIONS 10000
int test_memcmp(const void *s1, const void *s2, size_t n);
--------
- Without patch
0.390677136 seconds time elapsed
( +- 0.03% )
- with patch
0.375685926 seconds time elapsed
( +- 0.05% )
-> There is a ~4% improvement.
(3) 0~8 bytes
To observe the performance impact on < 8 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with the following:
-------
#include <string.h>
#include "utils.h"
-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 100000
int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
3.169203981 seconds time elapsed
( +- 0.23% )
- With patch
3.208257362 seconds time elapsed
( +- 0.13% )
-> There is a ~1% decrease.
(I don't know why yet, since the code path for a 0~8 byte memcmp() has
the same number of instructions with and without this patch. Any
comments will be appreciated.)
Signed-off-by: Simon Guo <wei.guo.si...@gmail.com>
---
arch/powerpc/lib/memcmp_64.S | 86 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 82 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..6dbafdb 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,25 +24,95 @@
#define rH r31
#ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
#define LD ldbrx
#else
+#define LH lhzx
+#define LW lwzx
#define LD ldx
#endif
_GLOBAL(memcmp)
cmpdi cr1,r5,0
- /* Use the short loop if both strings are not 8B aligned */
- or r6,r3,r4
+ /* Use the short loop if the src/dst addresses do not have
+ * the same offset from an 8-byte alignment boundary.
+ */
+ xor r6,r3,r4
andi. r6,r6,7
- /* Use the short loop if length is less than 32B */
- cmpdi cr6,r5,31
+ /* Fall back to the short loop if the length to compare is
+ * no greater than 8 bytes.
+ */
+ cmpdi cr6,r5,8
beq cr1,.Lzero
bne .Lshort
+ ble cr6,.Lshort
+
+.Lalignbytes_start:
+ /* The bits 0/1/2 of src/dst addr are the same. */
+ neg r0,r3
+ andi. r0,r0,7
+ beq .Lalign8bytes
+
+ PPC_MTOCRF(1,r0)
+ bf 31,.Lalign2bytes
+ lbz rA,0(r3)
+ lbz rB,0(r4)
+ cmplw cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ addi r3,r3,1
+ addi r4,r4,1
+ subi r5,r5,1
+.Lalign2bytes:
+ bf 30,.Lalign4bytes
+ LH rA,0,r3
+ LH rB,0,r4
+ cmplw cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ bne .Lnon_zero
+ addi r3,r3,2
+ addi r4,r4,2
+ subi r5,r5,2
+.Lalign4bytes:
+ bf 29,.Lalign8bytes
+ LW rA,0,r3
+ LW rB,0,r4
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ addi r3,r3,4
+ addi r4,r4,4
+ subi r5,r5,4
+.Lalign8bytes:
+ /* Now the addrs are 8-byte aligned. Use the short loop if fewer
+ * than 8 bytes remain.
+ */
+ cmpdi cr6,r5,7
+ ble cr6,.Lshort
+
+ /* Use the .Llong loop if 32 or more bytes remain to be compared */
+ cmpdi cr6,r5,31
bgt cr6,.Llong
+.Lcmploop_8bytes_31bytes:
+ /* handle 8 ~ 31 bytes with 8 bytes aligned addrs */
+ srdi. r0,r5,3
+ clrldi r5,r5,61
+ mtctr r0
+831:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ addi r3,r3,8
+ addi r4,r4,8
+ bdnz 831b
+
+ cmpwi r5,0
+ beq .Lzero
+
.Lshort:
mtctr r5
@@ -232,4 +302,12 @@ _GLOBAL(memcmp)
ld r28,-32(r1)
ld r27,-40(r1)
blr
+
+.LcmpAB_lightweight: /* skip NV GPRS restore */
+ li r3,1
+ bgt cr0,8f
+ li r3,-1
+8:
+ blr
+
EXPORT_SYMBOL(memcmp)