There is no need to replicate the fill byte across a full 32-bit word
when the length is less than 4, since in that case only byte stores
are performed. We can therefore branch immediately to the code that
handles that case. By separating it from the normal case, we can also
eliminate a few operations on the destination pointer.

Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr>
---
 arch/powerpc/lib/copy_32.S | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index a3ffeac69eca..05aaee20590f 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -91,17 +91,17 @@ EXPORT_SYMBOL(memset16)
  * replaced by a nop once cache is active. This is done in machine_init()
  */
 _GLOBAL(memset)
+       cmplwi  0,r5,4
+       blt     7f
+
        rlwimi  r4,r4,8,16,23
        rlwimi  r4,r4,16,0,15
 
-       addi    r6,r3,-4
-       cmplwi  0,r5,4
-       blt     7f
-       stwu    r4,4(r6)
+       stw     r4,0(r3)
        beqlr
-       andi.   r0,r6,3
+       andi.   r0,r3,3
        add     r5,r0,r5
-       subf    r6,r0,r6
+       subf    r6,r0,r3
        cmplwi  0,r4,0
        bne     2f      /* Use normal procedure if r4 is not zero */
 _GLOBAL(memset_nocache_branch)
@@ -132,13 +132,20 @@ _GLOBAL(memset_nocache_branch)
 1:     stwu    r4,4(r6)
        bdnz    1b
 6:     andi.   r5,r5,3
-7:     cmpwi   0,r5,0
        beqlr
        mtctr   r5
        addi    r6,r6,3
 8:     stbu    r4,1(r6)
        bdnz    8b
        blr
+
+7:     cmpwi   0,r5,0
+       beqlr
+       mtctr   r5
+       addi    r6,r3,-1
+9:     stbu    r4,1(r6)
+       bdnz    9b
+       blr
 EXPORT_SYMBOL(memset)
 
 /*
-- 
2.13.3

Reply via email to