From: Simon Guo <wei.guo.si...@gmail.com> Currently memcmp() in powerpc will fall back to .Lshort (compare per byte mode) if either src or dst address is not 8-byte aligned. It can be optimized if both addresses have the same offset from an 8-byte boundary.
memcmp() can first align the src/dst addresses to an 8-byte boundary and then compare in .Llong mode. This patch optimizes memcmp() behavior in this situation. Test result: (1) 256 bytes Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp: - without patch 50.715169506 seconds time elapsed ( +- 0.04% ) - with patch 28.906602373 seconds time elapsed ( +- 0.02% ) -> There is ~+75% improvement. (2) 32 bytes To observe performance impact on < 32 bytes, modify tools/testing/selftests/powerpc/stringloops/memcmp.c with the following: ------- #include <string.h> #include "utils.h" -#define SIZE 256 +#define SIZE 32 #define ITERATIONS 10000 int test_memcmp(const void *s1, const void *s2, size_t n); -------- - Without patch 0.390677136 seconds time elapsed ( +- 0.03% ) - with patch 0.375685926 seconds time elapsed ( +- 0.05% ) -> There is ~+4% improvement (3) 0~8 bytes To observe <8 bytes performance impact, modify tools/testing/selftests/powerpc/stringloops/memcmp.c with the following: ------- #include <string.h> #include "utils.h" -#define SIZE 256 -#define ITERATIONS 10000 +#define SIZE 8 +#define ITERATIONS 100000 int test_memcmp(const void *s1, const void *s2, size_t n); ------- - Without patch 3.169203981 seconds time elapsed ( +- 0.23% ) - With patch 3.208257362 seconds time elapsed ( +- 0.13% ) -> There is ~ -1% decrease. (I don't know why yet, since there are the same number of instructions in the code path for 0~8 bytes memcmp() with/without this patch. Any comments will be appreciated). 
Signed-off-by: Simon Guo <wei.guo.si...@gmail.com> --- arch/powerpc/lib/memcmp_64.S | 86 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S index d75d18b..6dbafdb 100644 --- a/arch/powerpc/lib/memcmp_64.S +++ b/arch/powerpc/lib/memcmp_64.S @@ -24,25 +24,95 @@ #define rH r31 #ifdef __LITTLE_ENDIAN__ +#define LH lhbrx +#define LW lwbrx #define LD ldbrx #else +#define LH lhzx +#define LW lwzx #define LD ldx #endif _GLOBAL(memcmp) cmpdi cr1,r5,0 - /* Use the short loop if both strings are not 8B aligned */ - or r6,r3,r4 + /* Use the short loop if the src/dst addresses are not + * with the same offset of 8 bytes align boundary. + */ + xor r6,r3,r4 andi. r6,r6,7 - /* Use the short loop if length is less than 32B */ - cmpdi cr6,r5,31 + /* fall back to short loop if compare at aligned addrs + * with no greater than 8 bytes. + */ + cmpdi cr6,r5,8 beq cr1,.Lzero bne .Lshort + ble cr6,.Lshort + +.Lalignbytes_start: + /* The bits 0/1/2 of src/dst addr are the same. */ + neg r0,r3 + andi. r0,r0,7 + beq .Lalign8bytes + + PPC_MTOCRF(1,r0) + bf 31,.Lalign2bytes + lbz rA,0(r3) + lbz rB,0(r4) + cmplw cr0,rA,rB + bne cr0,.LcmpAB_lightweight + addi r3,r3,1 + addi r4,r4,1 + subi r5,r5,1 +.Lalign2bytes: + bf 30,.Lalign4bytes + LH rA,0,r3 + LH rB,0,r4 + cmplw cr0,rA,rB + bne cr0,.LcmpAB_lightweight + bne .Lnon_zero + addi r3,r3,2 + addi r4,r4,2 + subi r5,r5,2 +.Lalign4bytes: + bf 29,.Lalign8bytes + LW rA,0,r3 + LW rB,0,r4 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + addi r3,r3,4 + addi r4,r4,4 + subi r5,r5,4 +.Lalign8bytes: + /* Now addrs are aligned with 8 bytes. Use the short loop if left + * bytes are less than 8B. + */ + cmpdi cr6,r5,7 + ble cr6,.Lshort + + /* Use .Llong loop if left cmp bytes are equal or greater than 32B */ + cmpdi cr6,r5,31 bgt cr6,.Llong +.Lcmploop_8bytes_31bytes: + /* handle 8 ~ 31 bytes with 8 bytes aligned addrs */ + srdi. 
r0,r5,3 + clrldi r5,r5,61 + mtctr r0 +831: + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + addi r3,r3,8 + addi r4,r4,8 + bdnz 831b + + cmpwi r5,0 + beq .Lzero + .Lshort: mtctr r5 @@ -232,4 +302,12 @@ _GLOBAL(memcmp) ld r28,-32(r1) ld r27,-40(r1) blr + +.LcmpAB_lightweight: /* skip NV GPRS restore */ + li r3,1 + bgt cr0,8f + li r3,-1 +8: + blr + EXPORT_SYMBOL(memcmp) -- 1.8.3.1