Albert, I guess it is too late for that now. I thought it would make it into 2015.01, since your last comment on v2 suggested that you would treat it as a bugfix...
--
Stefan

On 2014-12-18 18:10, Stefan Agner wrote:
> Resynchronize memcpy/memset with kernel 3.17 and build them in
> Thumb2 mode (unified syntax). Those assembler files can be built
> and linked in ARM mode too, however when calling them from Thumb2
> built code, the stack got corrupted and the copy did not succeed
> (the exact details have not been traced back). However, the Linux
> kernel builds those files in Thumb2 mode. Hence U-Boot should
> build them in Thumb2 mode too when CONFIG_SYS_THUMB_BUILD is set.
>
> To build the files without warning, some assembler instructions
> had to be replaced with their UAL compliant variant (thanks
> Jeroen for this input).
>
> To build the file in Thumb2 mode the implicit-it=always option need
> to be set to generate Thumb2 compliant IT instructions where needed.
> We add this option to the general AFLAGS when building for Thumb2.
>
> Reviewed-by: Simon Glass <s...@chromium.org>
> Tested-by: Simon Glass <s...@chromium.org>
> Signed-off-by: Stefan Agner <ste...@agner.ch>
> ---
>  arch/arm/config.mk               |   4 +-
>  arch/arm/include/asm/assembler.h |  33 ++++++++++--
>  arch/arm/lib/memcpy.S            |  80 +++++++++++++++++++---------
>  arch/arm/lib/memset.S            | 112 ++++++++++++++++++++-------------------
>  4 files changed, 142 insertions(+), 87 deletions(-)
>
> diff --git a/arch/arm/config.mk b/arch/arm/config.mk
> index f0eafd6..9a4c270 100644
> --- a/arch/arm/config.mk
> +++ b/arch/arm/config.mk
> @@ -26,7 +26,9 @@ PLATFORM_CPPFLAGS += -D__ARM__
>
>  # Choose between ARM/Thumb instruction sets
>  ifeq ($(CONFIG_SYS_THUMB_BUILD),y)
> -PF_CPPFLAGS_ARM := $(call cc-option, -mthumb -mthumb-interwork,\
> +AFLAGS_IMPLICIT_IT := $(call as-option,-Wa$(comma)-mimplicit-it=always)
> +PF_CPPFLAGS_ARM := $(AFLAGS_IMPLICIT_IT) \
> +		$(call cc-option, -mthumb -mthumb-interwork,\
>  			$(call cc-option,-marm,)\
>  			$(call cc-option,-mno-thumb-interwork,)\
>  		)
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index 5e4789b..11b80fb 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -14,12 +14,14 @@
>   * assembler source.
>   */
>
> +#include <config.h>
> +
>  /*
>   * Endian independent macros for shifting bytes within registers.
>   */
>  #ifndef __ARMEB__
> -#define pull		lsr
> -#define push		lsl
> +#define lspull		lsr
> +#define lspush		lsl
>  #define get_byte_0	lsl #0
>  #define get_byte_1	lsr #8
>  #define get_byte_2	lsr #16
> @@ -29,8 +31,8 @@
>  #define put_byte_2	lsl #16
>  #define put_byte_3	lsl #24
>  #else
> -#define pull		lsl
> -#define push		lsr
> +#define lspull		lsl
> +#define lspush		lsr
>  #define get_byte_0	lsr #24
>  #define get_byte_1	lsr #16
>  #define get_byte_2	lsr #8
> @@ -54,7 +56,28 @@
>  #define PLD(code...)
>  #endif
>
> +	.irp	c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
> +	.macro	ret\c, reg
> +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__)
> +	mov\c	pc, \reg
> +#else
> +	.ifeqs	"\reg", "lr"
> +	bx\c	\reg
> +	.else
> +	mov\c	pc, \reg
> +	.endif
> +#endif
> +	.endm
> +	.endr
> +
>  /*
> - * Cache alligned
> + * Cache aligned, used for optimized memcpy/memset
> + * In the kernel this is only enabled for Feroceon CPU's...
> + * We disable it especially for Thumb builds since those instructions
> + * are not made in a Thumb ready way...
>   */
> +#ifdef CONFIG_SYS_THUMB_BUILD
> +#define CALGN(code...)
> +#else
>  #define CALGN(code...) code
> +#endif
> diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
> index f655256..eeaf003 100644
> --- a/arch/arm/lib/memcpy.S
> +++ b/arch/arm/lib/memcpy.S
> @@ -10,9 +10,14 @@
>   * published by the Free Software Foundation.
>   */
>
> +#include <linux/linkage.h>
>  #include <asm/assembler.h>
>
> +#ifdef CONFIG_SYS_THUMB_BUILD
> +#define W(instr)	instr.w
> +#else
>  #define W(instr)	instr
> +#endif
>
>  #define LDR1W_SHIFT	0
>  #define STR1W_SHIFT	0
> @@ -30,7 +35,7 @@
>  	.endm
>
>  	.macro ldr1b ptr reg cond=al abort
> -	ldr\cond\()b \reg, [\ptr], #1
> +	ldrb\cond\() \reg, [\ptr], #1
>  	.endm
>
>  	.macro str1w ptr reg abort
> @@ -42,7 +47,7 @@
>  	.endm
>
>  	.macro str1b ptr reg cond=al abort
> -	str\cond\()b \reg, [\ptr], #1
> +	strb\cond\() \reg, [\ptr], #1
>  	.endm
>
>  	.macro enter reg1 reg2
> @@ -56,10 +61,12 @@
>  	.text
>
>  /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
> -
> -.globl memcpy
> -memcpy:
> -
> +	.syntax unified
> +#ifdef CONFIG_SYS_THUMB_BUILD
> +	.thumb
> +	.thumb_func
> +#endif
> +ENTRY(memcpy)
>  	cmp	r0, r1
>  	moveq	pc, lr
>
> @@ -79,7 +86,7 @@ memcpy:
>
>  	CALGN(	ands	ip, r0, #31	)
>  	CALGN(	rsb	r3, ip, #32	)
> -	CALGN(	sbcnes	r4, r3, r2	)	@ C is always set here
> +	CALGN(	sbcsne	r4, r3, r2	)	@ C is always set here
>  	CALGN(	bcs	2f		)
>  	CALGN(	adr	r4, 6f		)
>  	CALGN(	subs	r2, r2, r3	)	@ C gets set
> @@ -178,7 +185,7 @@ memcpy:
>
>  	CALGN(	ands	ip, r0, #31	)
>  	CALGN(	rsb	ip, ip, #32	)
> -	CALGN(	sbcnes	r4, ip, r2	)	@ C is always set here
> +	CALGN(	sbcsne	r4, ip, r2	)	@ C is always set here
>  	CALGN(	subcc	r2, r2, ip	)
>  	CALGN(	bcc	15f		)
>
> @@ -193,24 +200,24 @@
>
>  12:	PLD(	pld	[r1, #124]	)
>  13:	ldr4w	r1, r4, r5, r6, r7, abort=19f
> -	mov	r3, lr, pull #\pull
> +	mov	r3, lr, lspull #\pull
>  	subs	r2, r2, #32
>  	ldr4w	r1, r8, r9, ip, lr, abort=19f
> -	orr	r3, r3, r4, push #\push
> -	mov	r4, r4, pull #\pull
> -	orr	r4, r4, r5, push #\push
> -	mov	r5, r5, pull #\pull
> -	orr	r5, r5, r6, push #\push
> -	mov	r6, r6, pull #\pull
> -	orr	r6, r6, r7, push #\push
> -	mov	r7, r7, pull #\pull
> -	orr	r7, r7, r8, push #\push
> -	mov	r8, r8, pull #\pull
> -	orr	r8, r8, r9, push #\push
> -	mov	r9, r9, pull #\pull
> -	orr	r9, r9, ip, push #\push
> -	mov	ip, ip, pull #\pull
> -	orr	ip, ip, lr, push #\push
> +	orr	r3, r3, r4, lspush #\push
> +	mov	r4, r4, lspull #\pull
> +	orr	r4, r4, r5, lspush #\push
> +	mov	r5, r5, lspull #\pull
> +	orr	r5, r5, r6, lspush #\push
> +	mov	r6, r6, lspull #\pull
> +	orr	r6, r6, r7, lspush #\push
> +	mov	r7, r7, lspull #\pull
> +	orr	r7, r7, r8, lspush #\push
> +	mov	r8, r8, lspull #\pull
> +	orr	r8, r8, r9, lspush #\push
> +	mov	r9, r9, lspull #\pull
> +	orr	r9, r9, ip, lspush #\push
> +	mov	ip, ip, lspull #\pull
> +	orr	ip, ip, lr, lspush #\push
>  	str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
>  	bge	12b
>  	PLD(	cmn	r2, #96	)
> @@ -221,10 +228,10 @@
>  14:	ands	ip, r2, #28
>  	beq	16f
>
> -15:	mov	r3, lr, pull #\pull
> +15:	mov	r3, lr, lspull #\pull
>  	ldr1w	r1, lr, abort=21f
>  	subs	ip, ip, #4
> -	orr	r3, r3, lr, push #\push
> +	orr	r3, r3, lr, lspush #\push
>  	str1w	r0, r3, abort=21f
>  	bgt	15b
>  	CALGN(	cmp	r2, #0	)
> @@ -241,3 +248,24 @@
>  17:	forward_copy_shift	pull=16	push=16
>
>  18:	forward_copy_shift	pull=24	push=8
> +
> +
> +/*
> + * Abort preamble and completion macros.
> + * If a fixup handler is required then those macros must surround it.
> + * It is assumed that the fixup code will handle the private part of
> + * the exit macro.
> + */
> +
> +	.macro copy_abort_preamble
> +19:	ldmfd	sp!, {r5 - r9}
> +	b	21f
> +20:	ldmfd	sp!, {r5 - r8}
> +21:
> +	.endm
> +
> +	.macro copy_abort_end
> +	ldmfd	sp!, {r4, pc}
> +	.endm
> +
> +ENDPROC(memcpy)
> diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
> index 0cdf895..7208f20 100644
> --- a/arch/arm/lib/memset.S
> +++ b/arch/arm/lib/memset.S
> @@ -9,32 +9,25 @@
>   *
>   * ASM optimised string functions
>   */
> +#include <linux/linkage.h>
>  #include <asm/assembler.h>
>
>  	.text
>  	.align	5
> -	.word	0
>
> -1:	subs	r2, r2, #4		@ 1 do we have enough
> -	blt	5f			@ 1 bytes to align with?
> -	cmp	r3, #2			@ 1
> -	strltb	r1, [r0], #1		@ 1
> -	strleb	r1, [r0], #1		@ 1
> -	strb	r1, [r0], #1		@ 1
> -	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
> -/*
> - * The pointer is now aligned and the length is adjusted. Try doing the
> - * memset again.
> - */
> -
> -.globl memset
> -memset:
> +	.syntax unified
> +#ifdef CONFIG_SYS_THUMB_BUILD
> +	.thumb
> +	.thumb_func
> +#endif
> +ENTRY(memset)
>  	ands	r3, r0, #3		@ 1 unaligned?
> -	bne	1b			@ 1
> +	mov	ip, r0			@ preserve r0 as return value
> +	bne	6f			@ 1
>  /*
> - * we know that the pointer in r0 is aligned to a word boundary.
> + * we know that the pointer in ip is aligned to a word boundary.
>   */
> -	orr	r1, r1, r1, lsl #8
> +1:	orr	r1, r1, r1, lsl #8
>  	orr	r1, r1, r1, lsl #16
>  	mov	r3, r1
>  	cmp	r2, #16
> @@ -43,29 +36,28 @@ memset:
>  #if ! CALGN(1)+0
>
>  /*
> - * We need an extra register for this loop - save the return address and
> - * use the LR
> + * We need 2 extra registers for this loop - use r8 and the LR
>   */
> -	str	lr, [sp, #-4]!
> -	mov	ip, r1
> +	stmfd	sp!, {r8, lr}
> +	mov	r8, r1
>  	mov	lr, r1
>
>  2:	subs	r2, r2, #64
> -	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
> -	stmgeia	r0!, {r1, r3, ip, lr}
> -	stmgeia	r0!, {r1, r3, ip, lr}
> -	stmgeia	r0!, {r1, r3, ip, lr}
> +	stmiage	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
> +	stmiage	ip!, {r1, r3, r8, lr}
> +	stmiage	ip!, {r1, r3, r8, lr}
> +	stmiage	ip!, {r1, r3, r8, lr}
>  	bgt	2b
> -	ldmeqfd	sp!, {pc}		@ Now <64 bytes to go.
> +	ldmfdeq	sp!, {r8, pc}		@ Now <64 bytes to go.
>  /*
>   * No need to correct the count; we're only testing bits from now on
>   */
>  	tst	r2, #32
> -	stmneia	r0!, {r1, r3, ip, lr}
> -	stmneia	r0!, {r1, r3, ip, lr}
> +	stmiane	ip!, {r1, r3, r8, lr}
> +	stmiane	ip!, {r1, r3, r8, lr}
>  	tst	r2, #16
> -	stmneia	r0!, {r1, r3, ip, lr}
> -	ldr	lr, [sp], #4
> +	stmiane	ip!, {r1, r3, r8, lr}
> +	ldmfd	sp!, {r8, lr}
>
>  #else
>
> @@ -74,53 +66,63 @@ memset:
>   * whole cache lines at once.
>   */
>
> -	stmfd	sp!, {r4-r7, lr}
> +	stmfd	sp!, {r4-r8, lr}
>  	mov	r4, r1
>  	mov	r5, r1
>  	mov	r6, r1
>  	mov	r7, r1
> -	mov	ip, r1
> +	mov	r8, r1
>  	mov	lr, r1
>
>  	cmp	r2, #96
> -	tstgt	r0, #31
> +	tstgt	ip, #31
>  	ble	3f
>
> -	and	ip, r0, #31
> -	rsb	ip, ip, #32
> -	sub	r2, r2, ip
> -	movs	ip, ip, lsl #(32 - 4)
> -	stmcsia	r0!, {r4, r5, r6, r7}
> -	stmmiia	r0!, {r4, r5}
> -	tst	ip, #(1 << 30)
> -	mov	ip, r1
> -	strne	r1, [r0], #4
> +	and	r8, ip, #31
> +	rsb	r8, r8, #32
> +	sub	r2, r2, r8
> +	movs	r8, r8, lsl #(32 - 4)
> +	stmiacs	ip!, {r4, r5, r6, r7}
> +	stmiami	ip!, {r4, r5}
> +	tst	r8, #(1 << 30)
> +	mov	r8, r1
> +	strne	r1, [ip], #4
>
>  3:	subs	r2, r2, #64
> -	stmgeia	r0!, {r1, r3-r7, ip, lr}
> -	stmgeia	r0!, {r1, r3-r7, ip, lr}
> +	stmiage	ip!, {r1, r3-r8, lr}
> +	stmiage	ip!, {r1, r3-r8, lr}
>  	bgt	3b
> -	ldmeqfd	sp!, {r4-r7, pc}
> +	ldmfdeq	sp!, {r4-r8, pc}
>
>  	tst	r2, #32
> -	stmneia	r0!, {r1, r3-r7, ip, lr}
> +	stmiane	ip!, {r1, r3-r8, lr}
>  	tst	r2, #16
> -	stmneia	r0!, {r4-r7}
> -	ldmfd	sp!, {r4-r7, lr}
> +	stmiane	ip!, {r4-r7}
> +	ldmfd	sp!, {r4-r8, lr}
>
>  #endif
>
>  4:	tst	r2, #8
> -	stmneia	r0!, {r1, r3}
> +	stmiane	ip!, {r1, r3}
>  	tst	r2, #4
> -	strne	r1, [r0], #4
> +	strne	r1, [ip], #4
>  /*
>   * When we get here, we've got less than 4 bytes to zero. We
>   * may have an unaligned pointer as well.
>   */
>  5:	tst	r2, #2
> -	strneb	r1, [r0], #1
> -	strneb	r1, [r0], #1
> +	strbne	r1, [ip], #1
> +	strbne	r1, [ip], #1
>  	tst	r2, #1
> -	strneb	r1, [r0], #1
> -	mov	pc, lr
> +	strbne	r1, [ip], #1
> +	ret	lr
> +
> +6:	subs	r2, r2, #4		@ 1 do we have enough
> +	blt	5b			@ 1 bytes to align with?
> +	cmp	r3, #2			@ 1
> +	strblt	r1, [ip], #1		@ 1
> +	strble	r1, [ip], #1		@ 1
> +	strb	r1, [ip], #1		@ 1
> +	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
> +	b	1b
> +ENDPROC(memset)
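For readers who have not run into unified syntax before, here is a minimal standalone sketch of why the strneb -> strbne style renaming and the -Wa,-mimplicit-it=always option belong together (illustrative only, not part of the patch; the symbol name cond_store_demo is made up):

	.syntax unified
	.thumb
	.thumb_func
	.global	cond_store_demo
cond_store_demo:
	tst	r2, #1			@ set the flags
	@ the pre-UAL spelling 'strneb r1, [r0], #1' is rejected under
	@ .syntax unified; UAL puts the size suffix first, condition last
	strbne	r1, [r0], #1		@ a conditional instruction in Thumb2
					@ must sit in an IT block; with
					@ -mimplicit-it=always gas emits the
					@ needed 'it ne' by itself
	bx	lr

Assembled for ARM the same source needs no IT instruction at all, since the condition is encoded in the instruction itself, which is why these files could already be built in ARM mode before.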
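As a quick sanity check after a CONFIG_SYS_THUMB_BUILD build (the cross prefix below is only an example, and the object path assumes an in-tree build), disassembling the objects should now show Thumb2 code, with the W() variants coming out as wide 32-bit encodings:

	arm-linux-gnueabi-objdump -d arch/arm/lib/memcpy.o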