Hi Matthias,

On 26/01/2011 11:45, Matthias Weisser wrote:
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.
>
> Here are some numbers for test done with jadecpu
>
>                            | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
>                            |        | +patch |        | +patch |
> ---------------------------+--------+--------+--------+--------+
> Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
>                            |        |        |        |        |
> TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
>                            |        |        |        |        |
> FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
>                            |        |        |        |        |
> BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
>   where CRC is             |  615ms |  615ms |   54ms |   54ms |
>   uncompress               | 2460ms | 2462ms |  450ms |  451ms |
>   final boot_elf           |  376ms |   68ms |   65ms |   65ms |
>                            |        |        |        |        |
> BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
>   where CRC is             |  600ms |  600ms |  135ms |  135ms |
>   uncompress               | 2209ms | 2211ms |  828ms |  828ms |
>                            |        |        |        |        |
> Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |
>
> (1) No dcache
> (2) dcache enabled in board_init
> *Does not work when dcache is on
>
> Size impact:
>
> C version:
>    text    data     bss     dec     hex  filename
>  202862   18912  266456  488230   77326  u-boot
>
> ASM version:
>    text    data     bss     dec     hex  filename
>  203798   18912  266288  488998   77626  u-boot
>  222712  u-boot.bin
>
> Changes since V1:
> - Made the usage of these functions optional be CONFIG_USE_ARCH_MEM
> - Usage of PLD instruction on all architectures supporting it
> - Added a README entry
> - Minor style fixes
>
> Signed-off-by: Matthias Weisser <weiss...@arcor.de>
> ---

IIRC, the '---' line separates the patch commit message (above) from
freeform comments and history (below). Here, at least the version
history should move below the '---' line. Also, I think that above the
line, /some/ indication of the performance enhancement and of the
drawbacks should be given, but not a full ASCII table of numbers --
that can go below the line.
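To make that concrete, the layout I have in mind is roughly the
following (wording purely illustrative, not a proposed commit message):

    <subject line>

    Use the optimized memcpy() and memset() implementations from Linux
    on ARM, selectable per board through new CONFIG options. Roughly
    halves the time spent in these functions on jadecpu, at the cost
    of a bit less than 1 KiB of additional text.

    Signed-off-by: ...
    ---
    Changes since V1:
    - ...

    <benchmark table, size figures, diffstat and patch body>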
>  README                           |    6 +
>  arch/arm/include/asm/assembler.h |   60 ++++++++++
>  arch/arm/include/asm/string.h    |   10 ++-
>  arch/arm/lib/Makefile            |    2 +
>  arch/arm/lib/memcpy.S            |  241 ++++++++++++++++++++++++++++++++++++++
>  arch/arm/lib/memset.S            |  126 ++++++++++++++++++++
>  6 files changed, 443 insertions(+), 2 deletions(-)
>  create mode 100644 arch/arm/include/asm/assembler.h
>  create mode 100644 arch/arm/lib/memcpy.S
>  create mode 100644 arch/arm/lib/memset.S
>
> diff --git a/README b/README
> index 755d17c..5c610f2 100644
> --- a/README
> +++ b/README
> @@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options:
>  		that is executed before the actual U-Boot. E.g. when
>  		compiling a NAND SPL.
>
> +- CONFIG_USE_ARCH_MEMCPY
> +  CONFIG_USE_ARCH_MEMSET
> +	If these options are used a optimized version of memcpy/memset will
> +	be used if available. These functions may be faster under some
> +	conditions but may increase the binary size.
> +

The names of the options are not self-explanatory to me. If the
difference is "generic vs arch-optimal", then maybe
CONFIG_USE_ARCH_OPTIMAL_MEMxxx would be a better name?

>  Building the Software:
>  ======================
>
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> new file mode 100644
> index 0000000..418ee94
> --- /dev/null
> +++ b/arch/arm/include/asm/assembler.h
> @@ -0,0 +1,60 @@
> +/*
> + *  arch/arm/include/asm/assembler.h
> + *
> + *  Copyright (C) 1996-2000 Russell King
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + *  This file contains arm architecture specific defines
> + *  for the different processors.
> + *
> + *  Do not include any C declarations in this file - it is included by
> + *  assembler source.
> + */
> +
> +/*
> + * Endian independent macros for shifting bytes within registers.
> + */
> +#ifndef __ARMEB__
> +#define pull		lsr
> +#define push		lsl
> +#define get_byte_0	lsl #0
> +#define get_byte_1	lsr #8
> +#define get_byte_2	lsr #16
> +#define get_byte_3	lsr #24
> +#define put_byte_0	lsl #0
> +#define put_byte_1	lsl #8
> +#define put_byte_2	lsl #16
> +#define put_byte_3	lsl #24
> +#else
> +#define pull		lsl
> +#define push		lsr
> +#define get_byte_0	lsr #24
> +#define get_byte_1	lsr #16
> +#define get_byte_2	lsr #8
> +#define get_byte_3	lsl #0
> +#define put_byte_0	lsl #24
> +#define put_byte_1	lsl #16
> +#define put_byte_2	lsl #8
> +#define put_byte_3	lsl #0
> +#endif
> +
> +/*
> + * Data preload for architectures that support it
> + */
> +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
> +	defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
> +	defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \
> +	defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \
> +	defined(__ARM_ARCH_7R__)
> +#define PLD(code...)	code
> +#else
> +#define PLD(code...)
> +#endif
> +
> +/*
> + * Cache alligned
> + */
> +#define CALGN(code...)	code
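A short aside, mainly for readers who have not met these Linux macros
before: get_byte_N/put_byte_N are just shift amounts that select the
N-th *memory* byte of a 32-bit word regardless of endianness, and
pull/push are the complementary shifts that the misaligned-copy code
further down uses to stitch two partial words together. In rough C
terms -- this is only an illustration, not part of the patch:

#include <stdint.h>

/*
 * Rough C model of get_byte_1/put_byte_1 above: extract, or position,
 * the byte that lives at memory offset 1 within a 32-bit word, using
 * the same __ARMEB__ test as assembler.h.
 */
static inline uint8_t get_byte_1(uint32_t w)
{
#ifndef __ARMEB__
	return (uint8_t)(w >> 8);	/* little-endian: memory byte 1 is bits 15:8 */
#else
	return (uint8_t)(w >> 16);	/* big-endian: memory byte 1 is bits 23:16 */
#endif
}

static inline uint32_t put_byte_1(uint8_t b)
{
#ifndef __ARMEB__
	return (uint32_t)b << 8;
#else
	return (uint32_t)b << 16;
#endif
}

The same pattern holds for bytes 0, 2 and 3; memcpy.S below only needs
the pull/push pair (in its forward_copy_shift macro), but the idea is
the same.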
> diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
> index c3ea582..c6dfb25 100644
> --- a/arch/arm/include/asm/string.h
> +++ b/arch/arm/include/asm/string.h
> @@ -1,6 +1,8 @@
>  #ifndef __ASM_ARM_STRING_H
>  #define __ASM_ARM_STRING_H
>
> +#include <config.h>
> +
>  /*
>   * We don't do inline string functions, since the
>   * optimised inline asm versions are not small.
> @@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c);
>  #undef __HAVE_ARCH_STRCHR
>  extern char * strchr(const char * s, int c);
>
> -#undef __HAVE_ARCH_MEMCPY
> +#ifdef CONFIG_USE_ARCH_MEMCPY
> +#define __HAVE_ARCH_MEMCPY
> +#endif
>  extern void * memcpy(void *, const void *, __kernel_size_t);
>
>  #undef __HAVE_ARCH_MEMMOVE
> @@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t);
>  extern void * memchr(const void *, int, __kernel_size_t);
>
>  #undef __HAVE_ARCH_MEMZERO
> -#undef __HAVE_ARCH_MEMSET
> +#ifdef CONFIG_USE_ARCH_MEMSET
> +#define __HAVE_ARCH_MEMSET
> +#endif
>  extern void * memset(void *, int, __kernel_size_t);
>
>  #if 0
> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> index 454440c..03b1b5e 100644
> --- a/arch/arm/lib/Makefile
> +++ b/arch/arm/lib/Makefile
> @@ -44,6 +44,8 @@ COBJS-y	+= cache-cp15.o
>  endif
>  COBJS-y	+= interrupts.o
>  COBJS-y	+= reset.o
> +SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o
> +SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o
>
>  SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
>  	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
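For the record, and so the mechanism is obvious to other readers: a
board opts in from its configuration header, and that single define
both pulls the corresponding .S object into the build through the
Makefile hunk above and switches string.h over to the __HAVE_ARCH_*
declaration. Something like the following, where the board header name
is purely hypothetical and not part of the patch:

/* include/configs/myboard.h -- hypothetical example */
#define CONFIG_USE_ARCH_MEMCPY	/* builds arch/arm/lib/memcpy.S, defines __HAVE_ARCH_MEMCPY */
#define CONFIG_USE_ARCH_MEMSET	/* builds arch/arm/lib/memset.S, defines __HAVE_ARCH_MEMSET */

Boards that leave the options out keep today's generic C routines and
pay none of the size cost.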
> diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
> new file mode 100644
> index 0000000..40db90e
> --- /dev/null
> +++ b/arch/arm/lib/memcpy.S
> @@ -0,0 +1,241 @@
> +/*
> + *  linux/arch/arm/lib/memcpy.S
> + *
> + *  Author:	Nicolas Pitre
> + *  Created:	Sep 28, 2005
> + *  Copyright:	MontaVista Software, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <asm/assembler.h>
> +
> +#define W(instr)	instr
> +
> +#define LDR1W_SHIFT	0
> +#define STR1W_SHIFT	0
> +
> +	.macro ldr1w ptr reg abort
> +	W(ldr) \reg, [\ptr], #4
> +	.endm
> +
> +	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
> +	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
> +	.endm
> +
> +	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> +	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> +	.endm
> +
> +	.macro ldr1b ptr reg cond=al abort
> +	ldr\cond\()b \reg, [\ptr], #1
> +	.endm
> +
> +	.macro str1w ptr reg abort
> +	W(str) \reg, [\ptr], #4
> +	.endm
> +
> +	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> +	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> +	.endm
> +
> +	.macro str1b ptr reg cond=al abort
> +	str\cond\()b \reg, [\ptr], #1
> +	.endm
> +
> +	.macro enter reg1 reg2
> +	stmdb sp!, {r0, \reg1, \reg2}
> +	.endm
> +
> +	.macro exit reg1 reg2
> +	ldmfd sp!, {r0, \reg1, \reg2}
> +	.endm
> +
> +	.text
> +
> +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
> +
> +.globl memcpy
> +memcpy:
> +
> +		enter	r4, lr
> +
> +		subs	r2, r2, #4
> +		blt	8f
> +		ands	ip, r0, #3
> +	PLD(	pld	[r1, #0]		)
> +		bne	9f
> +		ands	ip, r1, #3
> +		bne	10f
> +
> +1:		subs	r2, r2, #(28)
> +		stmfd	sp!, {r5 - r8}
> +		blt	5f
> +
> +	CALGN(	ands	ip, r0, #31		)
> +	CALGN(	rsb	r3, ip, #32		)
> +	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
> +	CALGN(	bcs	2f			)
> +	CALGN(	adr	r4, 6f			)
> +	CALGN(	subs	r2, r2, r3		)  @ C gets set
> +	CALGN(	add	pc, r4, ip		)
> +
> +	PLD(	pld	[r1, #0]		)
> +2:	PLD(	subs	r2, r2, #96		)
> +	PLD(	pld	[r1, #28]		)
> +	PLD(	blt	4f			)
> +	PLD(	pld	[r1, #60]		)
> +	PLD(	pld	[r1, #92]		)
> +
> +3:	PLD(	pld	[r1, #124]		)
> +4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		subs	r2, r2, #32
> +		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		bge	3b
> +	PLD(	cmn	r2, #96			)
> +	PLD(	bge	4b			)
> +
> +5:		ands	ip, r2, #28
> +		rsb	ip, ip, #32
> +#if LDR1W_SHIFT > 0
> +		lsl	ip, ip, #LDR1W_SHIFT
> +#endif
> +		addne	pc, pc, ip		@ C is always clear here
> +		b	7f
> +6:
> +		.rept	(1 << LDR1W_SHIFT)
> +		W(nop)
> +		.endr
> +		ldr1w	r1, r3, abort=20f
> +		ldr1w	r1, r4, abort=20f
> +		ldr1w	r1, r5, abort=20f
> +		ldr1w	r1, r6, abort=20f
> +		ldr1w	r1, r7, abort=20f
> +		ldr1w	r1, r8, abort=20f
> +		ldr1w	r1, lr, abort=20f
> +
> +#if LDR1W_SHIFT < STR1W_SHIFT
> +		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
> +#elif LDR1W_SHIFT > STR1W_SHIFT
> +		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
> +#endif
> +		add	pc, pc, ip
> +		nop
> +		.rept	(1 << STR1W_SHIFT)
> +		W(nop)
> +		.endr
> +		str1w	r0, r3, abort=20f
> +		str1w	r0, r4, abort=20f
> +		str1w	r0, r5, abort=20f
> +		str1w	r0, r6, abort=20f
> +		str1w	r0, r7, abort=20f
> +		str1w	r0, r8, abort=20f
> +		str1w	r0, lr, abort=20f
> +
> +	CALGN(	bcs	2b			)
> +
> +7:		ldmfd	sp!, {r5 - r8}
> +
> +8:		movs	r2, r2, lsl #31
> +		ldr1b	r1, r3, ne, abort=21f
> +		ldr1b	r1, r4, cs, abort=21f
> +		ldr1b	r1, ip, cs, abort=21f
> +		str1b	r0, r3, ne, abort=21f
> +		str1b	r0, r4, cs, abort=21f
> +		str1b	r0, ip, cs, abort=21f
> +
> +		exit	r4, pc
> +
> +9:		rsb	ip, ip, #4
> +		cmp	ip, #2
> +		ldr1b	r1, r3, gt, abort=21f
> +		ldr1b	r1, r4, ge, abort=21f
> +		ldr1b	r1, lr, abort=21f
> +		str1b	r0, r3, gt, abort=21f
> +		str1b	r0, r4, ge, abort=21f
> +		subs	r2, r2, ip
> +		str1b	r0, lr, abort=21f
> +		blt	8b
> +		ands	ip, r1, #3
> +		beq	1b
> +
> +10:		bic	r1, r1, #3
> +		cmp	ip, #2
> +		ldr1w	r1, lr, abort=21f
> +		beq	17f
> +		bgt	18f
> +
> +
> +		.macro	forward_copy_shift pull push
> +
> +		subs	r2, r2, #28
> +		blt	14f
> +
> +	CALGN(	ands	ip, r0, #31		)
> +	CALGN(	rsb	ip, ip, #32		)
> +	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
> +	CALGN(	subcc	r2, r2, ip		)
> +	CALGN(	bcc	15f			)
> +
> +11:		stmfd	sp!, {r5 - r9}
> +
> +	PLD(	pld	[r1, #0]		)
> +	PLD(	subs	r2, r2, #96		)
> +	PLD(	pld	[r1, #28]		)
> +	PLD(	blt	13f			)
> +	PLD(	pld	[r1, #60]		)
> +	PLD(	pld	[r1, #92]		)
> +
> +12:	PLD(	pld	[r1, #124]		)
> +13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
> +		mov	r3, lr, pull #\pull
> +		subs	r2, r2, #32
> +		ldr4w	r1, r8, r9, ip, lr, abort=19f
> +		orr	r3, r3, r4, push #\push
> +		mov	r4, r4, pull #\pull
> +		orr	r4, r4, r5, push #\push
> +		mov	r5, r5, pull #\pull
> +		orr	r5, r5, r6, push #\push
> +		mov	r6, r6, pull #\pull
> +		orr	r6, r6, r7, push #\push
> +		mov	r7, r7, pull #\pull
> +		orr	r7, r7, r8, push #\push
> +		mov	r8, r8, pull #\pull
> +		orr	r8, r8, r9, push #\push
> +		mov	r9, r9, pull #\pull
> +		orr	r9, r9, ip, push #\push
> +		mov	ip, ip, pull #\pull
> +		orr	ip, ip, lr, push #\push
> +		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
> +		bge	12b
> +	PLD(	cmn	r2, #96			)
> +	PLD(	bge	13b			)
> +
> +		ldmfd	sp!, {r5 - r9}
> +
> +14:		ands	ip, r2, #28
> +		beq	16f
> +
> +15:		mov	r3, lr, pull #\pull
> +		ldr1w	r1, lr, abort=21f
> +		subs	ip, ip, #4
> +		orr	r3, r3, lr, push #\push
> +		str1w	r0, r3, abort=21f
> +		bgt	15b
> +	CALGN(	cmp	r2, #0			)
> +	CALGN(	bge	11b			)
> +
> +16:		sub	r1, r1, #(\push / 8)
> +		b	8b
> +
> +		.endm
> +
> +
> +		forward_copy_shift	pull=8	push=24
> +
> +17:		forward_copy_shift	pull=16	push=16
> +
> +18:		forward_copy_shift	pull=24	push=8
> +
> diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
> new file mode 100644
> index 0000000..0cdf895
> --- /dev/null
> +++ b/arch/arm/lib/memset.S
> @@ -0,0 +1,126 @@
> +/*
> + *  linux/arch/arm/lib/memset.S
> + *
> + *  Copyright (C) 1995-2000 Russell King
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + *  ASM optimised string functions
> + */
> +#include <asm/assembler.h>
> +
> +	.text
> +	.align	5
> +	.word	0
> +
> +1:	subs	r2, r2, #4		@ 1 do we have enough
> +	blt	5f			@ 1 bytes to align with?
> +	cmp	r3, #2			@ 1
> +	strltb	r1, [r0], #1		@ 1
> +	strleb	r1, [r0], #1		@ 1
> +	strb	r1, [r0], #1		@ 1
> +	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
> +/*
> + * The pointer is now aligned and the length is adjusted.  Try doing the
> + * memset again.
> + */
> +
> +.globl memset
> +memset:
> +	ands	r3, r0, #3		@ 1 unaligned?
> +	bne	1b			@ 1
> +/*
> + * we know that the pointer in r0 is aligned to a word boundary.
> + */
> +	orr	r1, r1, r1, lsl #8
> +	orr	r1, r1, r1, lsl #16
> +	mov	r3, r1
> +	cmp	r2, #16
> +	blt	4f
> +
> +#if ! CALGN(1)+0
> +
> +/*
> + * We need an extra register for this loop - save the return address and
> + * use the LR
> + */
> +	str	lr, [sp, #-4]!
> +	mov	ip, r1
> +	mov	lr, r1
> +
> +2:	subs	r2, r2, #64
> +	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	bgt	2b
> +	ldmeqfd	sp!, {pc}		@ Now < 64 bytes to go.
> +/*
> + * No need to correct the count; we're only testing bits from now on
> + */
> +	tst	r2, #32
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	tst	r2, #16
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	ldr	lr, [sp], #4
> +
> +#else
> +
> +/*
> + * This version aligns the destination pointer in order to write
> + * whole cache lines at once.
> + */
> +
> +	stmfd	sp!, {r4-r7, lr}
> +	mov	r4, r1
> +	mov	r5, r1
> +	mov	r6, r1
> +	mov	r7, r1
> +	mov	ip, r1
> +	mov	lr, r1
> +
> +	cmp	r2, #96
> +	tstgt	r0, #31
> +	ble	3f
> +
> +	and	ip, r0, #31
> +	rsb	ip, ip, #32
> +	sub	r2, r2, ip
> +	movs	ip, ip, lsl #(32 - 4)
> +	stmcsia	r0!, {r4, r5, r6, r7}
> +	stmmiia	r0!, {r4, r5}
> +	tst	ip, #(1 << 30)
> +	mov	ip, r1
> +	strne	r1, [r0], #4
> +
> +3:	subs	r2, r2, #64
> +	stmgeia	r0!, {r1, r3-r7, ip, lr}
> +	stmgeia	r0!, {r1, r3-r7, ip, lr}
> +	bgt	3b
> +	ldmeqfd	sp!, {r4-r7, pc}
> +
> +	tst	r2, #32
> +	stmneia	r0!, {r1, r3-r7, ip, lr}
> +	tst	r2, #16
> +	stmneia	r0!, {r4-r7}
> +	ldmfd	sp!, {r4-r7, lr}
> +
> +#endif
> +
> +4:	tst	r2, #8
> +	stmneia	r0!, {r1, r3}
> +	tst	r2, #4
> +	strne	r1, [r0], #4
> +/*
> + * When we get here, we've got less than 4 bytes to zero.  We
> + * may have an unaligned pointer as well.
> + */
> +5:	tst	r2, #2
> +	strneb	r1, [r0], #1
> +	strneb	r1, [r0], #1
> +	tst	r2, #1
> +	strneb	r1, [r0], #1
> +	mov	pc, lr

Amicalement,
-- 
Albert.