From: Simon Guo <wei.guo.si...@gmail.com>

This patch adds VMX primitives to perform memcmp() when the compare size
exceeds 4K bytes.

Test results with the following test program:
------
tools/testing/selftests/powerpc/stringloops# cat memcmp.c

/*
 * Routine being timed by this benchmark. It is only declared here; its
 * definition lives outside this listing (presumably the powerpc memcmp
 * implementation built into the selftest -- confirm against the Makefile).
 */
int test_memcmp(const void *s1, const void *s2, size_t n);

/*
 * Allocate one SIZE-byte buffer aligned to 128 bytes.  On allocation
 * failure, report it through perror() and terminate with exit status 1.
 */
static char *alloc_buffer(void)
{
        char *buf = memalign(128, SIZE);

        if (buf == NULL) {
                perror("memalign");
                exit(1);
        }

        return buf;
}

/*
 * Benchmark body: build two identical SIZE-byte buffers and run
 * test_memcmp() over them ITERATIONS times.  The comparison result is
 * deliberately ignored; only the elapsed time is of interest.
 *
 * Always returns 0.
 */
static int testcase(void)
{
        char *s1 = alloc_buffer();
        char *s2 = alloc_buffer();
        unsigned long n;

        /* Fill both buffers with the same repeating 0x00..0xff pattern. */
        for (n = 0; n < SIZE; n++) {
                char pattern = n & 0xff;

                s1[n] = pattern;
                s2[n] = pattern;
        }

        /* Equal inputs, so every call has to walk the full length. */
        for (n = 0; n < ITERATIONS; n++)
                test_memcmp(s1, s2, SIZE);

        return 0;
}

/* Entry point: run testcase() through test_harness() under the name "memcmp". */
int main(void)
{
        int status = test_harness(testcase, "memcmp");

        return status;
}

------
Without VMX patch:
       5.085776331 seconds time elapsed    ( +-  0.28% )
With VMX patch:
       4.584002052 seconds time elapsed    ( +-  0.02% )

There is a ~10% improvement.

However, I am not yet aware of any use case in the kernel for memcmp() on
such large sizes.

Signed-off-by: Simon Guo <wei.guo.si...@gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/lib/copypage_power7.S        |  2 +-
 arch/powerpc/lib/memcmp_64.S              | 79 +++++++++++++++++++++++++++++++
 arch/powerpc/lib/memcpy_power7.S          |  2 +-
 arch/powerpc/lib/vmx-helper.c             |  2 +-
 5 files changed, 83 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 7330150..e6530d8 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,7 +49,7 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
+int enter_vmx_ops(void);
 void * exit_vmx_copy(void *dest);
 
 /* Traps */
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index ca5fc8f..9e7729e 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -60,7 +60,7 @@ _GLOBAL(copypage_power7)
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
-       bl      enter_vmx_copy
+       bl      enter_vmx_ops
        cmpwi   r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 6dbafdb..b86a1d3 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -153,6 +153,13 @@ _GLOBAL(memcmp)
        blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+       /* Try to use vmx loop if length is larger than 4K */
+       cmpldi  cr6,r5,4096
+       bgt     cr6,.Lvmx_cmp
+
+.Llong_novmx_cmp:
+#endif
        li      off8,8
        li      off16,16
        li      off24,24
@@ -310,4 +317,76 @@ _GLOBAL(memcmp)
 8:
        blr
 
+#ifdef CONFIG_ALTIVEC
+.Lvmx_cmp:
+       mflr    r0
+       std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+       std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+       std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
+       std     r0,16(r1)
+       stdu    r1,-STACKFRAMESIZE(r1)
+       bl      enter_vmx_ops
+       cmpwi   cr1,r3,0
+       ld      r0,STACKFRAMESIZE+16(r1)
+       ld      r3,STK_REG(R31)(r1)
+       ld      r4,STK_REG(R30)(r1)
+       ld      r5,STK_REG(R29)(r1)
+       addi    r1,r1,STACKFRAMESIZE
+       mtlr    r0
+       beq     cr1,.Llong_novmx_cmp
+
+3:
+       /* Enter with src/dst address 8 bytes aligned, and len is
+        * no less than 4KB. Need to align with 16 bytes further.
+        */
+       andi.   rA,r3,8
+       beq     4f
+       LD      rA,0,r3
+       LD      rB,0,r4
+       cmpld   cr0,rA,rB
+       bne     cr0,.LcmpAB_lightweight
+
+       addi    r3,r3,8
+       addi    r4,r4,8
+
+4:
+       /* compare 32 bytes for each loop */
+       srdi    r0,r5,5
+       mtctr   r0
+       andi.   r5,r5,31
+       li      off16,16
+5:
+       lvx     v0,0,r3
+       lvx     v1,0,r4
+       vcmpequd. v0,v0,v1
+       bf      24,7f
+       lvx     v0,off16,r3
+       lvx     v1,off16,r4
+       vcmpequd. v0,v0,v1
+       bf      24,6f
+       addi    r3,r3,32
+       addi    r4,r4,32
+       bdnz    5b
+
+       cmpdi   r5,0
+       beq     .Lzero
+       b       .Lshort
+
+6:
+       addi    r3,r3,16
+       addi    r4,r4,16
+
+7:
+       LD      rA,0,r3
+       LD      rB,0,r4
+       cmpld   cr0,rA,rB
+       bne     cr0,.LcmpAB_lightweight
+
+       li      off8,8
+       LD      rA,off8,r3
+       LD      rB,off8,r4
+       cmpld   cr0,rA,rB
+       bne     cr0,.LcmpAB_lightweight
+       b       .Lzero
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index 193909a..682e386 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
-       bl      enter_vmx_copy
+       bl      enter_vmx_ops
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..923a9ab 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
        return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
        if (in_interrupt())
                return 0;
-- 
1.8.3.1

Reply via email to