This patch shortens the non-constant memcpy() by two bytes and fixes constant-size memcpy() calls being spuriously emitted out of line.
The patch is run-tested (I am running a patched kernel right now). A benchmark and a code-generation test program will be mailed as a reply.

# size vmlinux.org vmlinux
   text    data     bss     dec     hex filename
3954591 1553426  236544 5744561  57a7b1 vmlinux.org
3952615 1553426  236544 5742585  579ff9 vmlinux

Example of changes (part of the dump_fpu() body):

old.............................................. new......................
8d 83 40 02 00 00 lea 0x240(%ebx),%eax            8d b3 40 02 00 00 lea 0x240(%ebx),%esi
74 31             je c0108b27 <dump_fpu+0x9c>     74 2e             je c0108b1d <dump_fpu+0x92>
6a 1c             push $0x1c                      8b 7d 0c          mov 0xc(%ebp),%edi
50                push %eax                       b9 07 00 00 00    mov $0x7,%ecx
56                push %esi                       f3 a5             repz movsl %ds:(%esi),%es:(%edi)
e8 49 21 10 00    call c020ac48 <memcpy>          8b 55 0c          mov 0xc(%ebp),%edx
83 c4 0c          add $0xc,%esp                   83 c2 1c          add $0x1c,%edx
83 c6 1c          add $0x1c,%esi                  8d 83 60 02 00 00 lea 0x260(%ebx),%eax
81 c3 60 02 00 00 add $0x260,%ebx                 b9 07 00 00 00    mov $0x7,%ecx
bf 07 00 00 00    mov $0x7,%edi                   89 d7             mov %edx,%edi
6a 0a             push $0xa                       89 c6             mov %eax,%esi
53                push %ebx                       a5                movsl %ds:(%esi),%es:(%edi)
56                push %esi                       a5                movsl %ds:(%esi),%es:(%edi)
e8 2f 21 10 00    call c020ac48 <memcpy>          66 a5             movsw %ds:(%esi),%es:(%edi)
83 c4 0c          add $0xc,%esp                   83 c2 0a          add $0xa,%edx
83 c6 0a          add $0xa,%esi                   83 c0 10          add $0x10,%eax
83 c3 10          add $0x10,%ebx                  49                dec %ecx
4f                dec %edi                        79 ef             jns c0108b0a <dump_fpu+0x7f>
79 eb             jns c0108b10 <dump_fpu+0x85>    eb 0a             jmp c0108b27 <dump_fpu+0x9c>
eb 0c             jmp c0108b33 <dump_fpu+0xa8>    8b 7d 0c          mov 0xc(%ebp),%edi
6a 6c             push $0x6c                      b9 1b 00 00 00    mov $0x1b,%ecx
50                push %eax                       f3 a5             repz movsl %ds:(%esi),%es:(%edi)
56                push %esi                       8b 45 f0          mov 0xfffffff0(%ebp),%eax
e8 18 21 10 00    call c020ac48 <memcpy>          5a                pop %edx
83 c4 0c          add $0xc,%esp                   5b                pop %ebx
8b 45 f0          mov 0xfffffff0(%ebp),%eax
8d 65 f4          lea 0xfffffff4(%ebp),%esp
5b                pop %ebx
5e                pop %esi
-- vda
--- linux-2.6.11.src/include/asm-i386/string.h.orig Thu Mar 3 09:31:08 2005 +++ linux-2.6.11.src/include/asm-i386/string.h Tue Mar 29 22:05:00 2005 @@ -198,46 +198,75 @@ static inline void * __memcpy(void * to, int d0, d1, d2; __asm__ __volatile__( "rep ; movsl\n\t" - "testb $2,%b4\n\t" - "je 1f\n\t" - "movsw\n" - "1:\ttestb $1,%b4\n\t" - "je 2f\n\t" - "movsb\n" - "2:" + "movl %4,%%ecx\n\t" + "andl $3,%%ecx\n\t" + "jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */ + "rep ; movsb\n\t" + "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) - :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) + : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from) : "memory"); return (to); } /* - * This looks horribly ugly, but the compiler can optimize it totally, + * This looks ugly, but the compiler can optimize it totally, * as the count is constant. */ static inline void * __constant_memcpy(void * to, const void * from, size_t n) { - if (n <= 128) - return __builtin_memcpy(to, from, n); - -#define COMMON(x) \ -__asm__ __volatile__( \ - "rep ; movsl" \ - x \ - : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ - : "0" (n/4),"1" ((long) to),"2" ((long) from) \ - : "memory"); -{ - int d0, d1, d2; +#if 1 /* want to do small copies with non-string ops? */ + switch (n) { + case 0: return to; + case 1: *(char*)to = *(char*)from; return to; + case 2: *(short*)to = *(short*)from; return to; + case 4: *(int*)to = *(int*)from; return to; +#if 1 /* including those doable with two moves? 
*/ + case 3: *(short*)to = *(short*)from; + *((char*)to+2) = *((char*)from+2); return to; + case 5: *(int*)to = *(int*)from; + *((char*)to+4) = *((char*)from+4); return to; + case 6: *(int*)to = *(int*)from; + *((short*)to+2) = *((short*)from+2); return to; + case 8: *(int*)to = *(int*)from; + *((int*)to+1) = *((int*)from+1); return to; +#endif + } +#else + if (!n) return to; +#endif + { + /* load esi/edi */ + int esi, edi; + __asm__ __volatile__( + "" + : "=&D" (edi), "=&S" (esi) + : "0" ((long) to),"1" ((long) from) + : "memory" + ); + } + if (n >= 5*4) { + /* large block: use rep prefix */ + int ecx; + __asm__ __volatile__( + "rep ; movsl" + : "=&c" (ecx) + : "0" (n/4) + ); + } else { + /* small block: don't clobber ecx + smaller code */ + if (n >= 4*4) __asm__ __volatile__("movsl"); + if (n >= 3*4) __asm__ __volatile__("movsl"); + if (n >= 2*4) __asm__ __volatile__("movsl"); + if (n >= 1*4) __asm__ __volatile__("movsl"); + } switch (n % 4) { - case 0: COMMON(""); return to; - case 1: COMMON("\n\tmovsb"); return to; - case 2: COMMON("\n\tmovsw"); return to; - default: COMMON("\n\tmovsw\n\tmovsb"); return to; + /* tail */ + case 0: return to; + case 1: __asm__ __volatile__("movsb"); return to; + case 2: __asm__ __volatile__("movsw"); return to; + default: __asm__ __volatile__("movsw\n\tmovsb"); return to; } -} - -#undef COMMON } #define __HAVE_ARCH_MEMCPY