cacheable_memcpy uses the dcbz instruction and is more efficient than memcpy when the destination is in RAM.
This patch renames memcpy as generic_memcpy, and defines memcpy as a prolog to cacheable_memcpy. This prolog checks if the buffer is in RAM. If not, it falls back to generic_memcpy(). On MPC885, we get an approximately 7% increase in the transfer rate on FTP reception. Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr> --- arch/powerpc/lib/copy_32.S | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index d8a9a86..8f76d49 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -161,13 +161,27 @@ _GLOBAL(generic_memset) * We only use this version if the source and dest don't overlap. * -- paulus. */ +_GLOBAL(memmove) + cmplw 0,r3,r4 + bgt backwards_memcpy + /* fall through */ + +_GLOBAL(memcpy) + cmplwi r5,L1_CACHE_BYTES + blt- generic_memcpy + lis r8,max_pfn@ha + lwz r8,max_pfn@l(r8) + tophys (r9,r3) + srwi r9,r9,PAGE_SHIFT + cmplw r9,r8 + bge- generic_memcpy _GLOBAL(cacheable_memcpy) add r7,r3,r5 /* test if the src & dst overlap */ add r8,r4,r5 cmplw 0,r4,r7 cmplw 1,r3,r8 crand 0,0,4 /* cr0.lt &= cr1.lt */ - blt memcpy /* if regions overlap */ + blt generic_memcpy /* if regions overlap */ addi r4,r4,-4 addi r6,r3,-4 @@ -233,12 +247,7 @@ _GLOBAL(cacheable_memcpy) bdnz 40b 65: blr -_GLOBAL(memmove) - cmplw 0,r3,r4 - bgt backwards_memcpy - /* fall through */ - -_GLOBAL(memcpy) +_GLOBAL(generic_memcpy) srwi. r7,r5,3 addi r6,r3,-4 addi r4,r4,-4 -- 2.1.0 _______________________________________________ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev