Hi to all,

I made this patch because some people asked for 486-optimized string
routines for older (486 and 586) machines. I actually did two things:

- replaced the buggy macro definitions for memset and memcpy, which
  broke the compile; a paranoia check for (count == 0) was also added;

- rewrote these functions to speed them up and reduce the code size.

To anyone who is interested - please send me comments, bug reports or
ideas. The goal is to make the kernel use these routines when running
on a 486 or 586 processor, so please test...

			best, Petkan
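Roughly, the new non-constant-length __memcpy boils down to the plain C
below (an illustrative sketch only -- sketch_memcpy and the little test
are made up for this mail, the real code is the inline asm in the patch
that follows):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/*
 * Simplified C model of the new __memcpy: copy len/4 longwords first
 * ("rep; movsl"), then the remaining 0-3 bytes (the "testb $2" /
 * "testb $1" tail).  Assumes x86-style unaligned 32-bit accesses are
 * fine, as the kernel code does.
 */
static void *sketch_memcpy(void *to, const void *from, size_t len)
{
	uint32_t *d4 = to;
	const uint32_t *s4 = from;
	unsigned char *d1;
	const unsigned char *s1;
	size_t n = len / 4;

	while (n--)			/* rep; movsl */
		*d4++ = *s4++;

	d1 = (unsigned char *)d4;
	s1 = (const unsigned char *)s4;
	if (len & 2) {			/* movsw */
		*d1++ = *s1++;
		*d1++ = *s1++;
	}
	if (len & 1)			/* movsb */
		*d1 = *s1;

	return to;
}

int main(void)
{
	char src[] = "0123456789abcdef";	/* 17 bytes incl. the NUL */
	char dst[sizeof(src)];

	sketch_memcpy(dst, src, sizeof(src));
	printf("%s\n", memcmp(dst, src, sizeof(src)) ? "FAIL" : "OK");
	return 0;
}

The memcpy() macro itself only picks the constant-length variant when
gcc can prove the length is a non-zero compile-time constant, i.e.
(__builtin_constant_p(count) && count) -- that is where the
(count == 0) paranoia check lives.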
--- linux-2.4.0-test7/include/asm-i386/string-486.h.orig	Tue Aug 29 11:10:51 2000
+++ linux/include/asm-i386/string-486.h	Thu Aug 31 16:10:11 2000
@@ -16,10 +16,11 @@
  * Split into 2 CPU specific files by Alan Cox to keep #ifdef noise down.
  *
  * 1999/10/5	Proper register args for newer GCCs and minor bugs
- *		fixed - Petko Manolov ([EMAIL PROTECTED])
+ *		fixed - Petko Manolov ([EMAIL PROTECTED])
  * 1999/10/14	3DNow memscpy() added - Petkan
  * 2000/05/09	extern changed to static in function definitions
  *		and a few cleanups - Petkan
+ * 2000/08/29	memset and memcpy rewritten - Petkan
  */
 
 #define __HAVE_ARCH_STRCPY
@@ -273,79 +274,60 @@
 
 /* end of additional stuff */
 
-/*
- * These ought to get tweaked to do some cache priming.
- */
-
-static inline void * __memcpy_by4(void * to, const void * from, size_t n)
+static inline void *__memcpy(void * to, const void *from, size_t len)
 {
-register void *tmp = (void *)to;
-register int dummy1,dummy2;
-__asm__ __volatile__ (
-	"\n1:\tmovl (%2),%0\n\t"
-	"addl $4,%2\n\t"
-	"movl %0,(%1)\n\t"
-	"addl $4,%1\n\t"
-	"decl %3\n\t"
-	"jnz 1b"
-	:"=r" (dummy1), "=r" (tmp), "=r" (from), "=r" (dummy2)
-	:"1" (tmp), "2" (from), "3" (n/4)
-	:"memory");
-return (to);
-}
+	int d0,d1,d2;
 
-static inline void * __memcpy_by2(void * to, const void * from, size_t n)
-{
-register void *tmp = (void *)to;
-register int dummy1,dummy2;
-__asm__ __volatile__ (
-	"shrl $1,%3\n\t"
-	"jz 2f\n" /* only a word */
-	"1:\tmovl (%2),%0\n\t"
-	"addl $4,%2\n\t"
-	"movl %0,(%1)\n\t"
-	"addl $4,%1\n\t"
-	"decl %3\n\t"
-	"jnz 1b\n"
-	"2:\tmovw (%2),%w0\n\t"
-	"movw %w0,(%1)"
-	:"=r" (dummy1), "=r" (tmp), "=r" (from), "=r" (dummy2)
-	:"1" (tmp), "2" (from), "3" (n/2)
-	:"memory");
-return (to);
-}
-
-static inline void * __memcpy_g(void * to, const void * from, size_t n)
-{
-int d0, d1, d2;
-register void *tmp = (void *)to;
-__asm__ __volatile__ (
-	"shrl $1,%%ecx\n\t"
-	"jnc 1f\n\t"
-	"movsb\n"
-	"1:\tshrl $1,%%ecx\n\t"
-	"jnc 2f\n\t"
+	__asm__ __volatile__ (
+	"rep; movsl\n\t"
+	"testb $2,%b3\n\t"
+	"jz 1f\n\t"
 	"movsw\n"
-	"2:\trep\n\t"
-	"movsl"
+	"1:\t"
+	"testb $1,%b3\n\t"
+	"jz 2f\n\t"
+	"movsb\n"
+	"2:"
 	:"=&c" (d0), "=&D" (d1), "=&S" (d2)
-	:"0" (n), "1" ((long) tmp), "2" ((long) from)
-	:"memory");
-return (to);
+	:"q" (len), "0" (len/4), "1" (to), "2" (from)
+	:"memory"
+	);
+
+	return to;
 }
 
-#define __memcpy_c(d,s,count) \
-((count%4==0) ? \
- __memcpy_by4((d),(s),(count)) : \
- ((count%2==0) ? \
- __memcpy_by2((d),(s),(count)) : \
- __memcpy_g((d),(s),(count))))
-
-#define __memcpy(d,s,count) \
-(__builtin_constant_p(count) ? \
- __memcpy_c((d),(s),(count)) : \
- __memcpy_g((d),(s),(count)))
-
+
+static inline void *__constant_memcpy(void * to, const void *from, size_t len)
+{
+	int d0,d1,d2;
+	register int tmp;
+
+#define MEMCP(x) \
+	__asm__ volatile ( \
+	"\n1:\t" \
+	"movl (%0),%3\n\t" \
+	"movl %3,(%1)\n\t" \
+	"addl $4,%1\n\t" \
+	"addl $4,%0\n\t" \
+	"decl %2\n\t" \
+	"jnz 1b\n\t" \
+	x \
+	:"=r" (d0), "=r" (d1), "=r" (d2),"=q" (tmp)\
+	:"0" (from), "1" (to), "2" (len/4) \
+	:"memory" \
+	)
+
+	switch ( len % 4 ) {
+	case 0: MEMCP(""); return to;
+	case 1: MEMCP("movb (%0),%b3; movb %b3,(%1)"); return to;
+	case 2: MEMCP("movw (%0),%w3; movw %w3,(%1)"); return to;
+	default: MEMCP("movw (%0),%w3; movw %w3,(%1)\n\t"
+		       "movb 2(%0),%b3; movb %b3,2(%1)"); return to;
+	}
+#undef MEMCP
+}
+
+
 #define __HAVE_ARCH_MEMCPY
 
 #include <linux/config.h>
@@ -363,24 +345,26 @@
 **	This CPU favours 3DNow strongly (eg AMD K6-II, K6-III, Athlon)
 */
 
-static inline void * __constant_memcpy3d(void * to, const void * from, size_t len)
+static inline void *__memcpy3d(void *to, const void *from, size_t len)
 {
 	if(len<512 || in_interrupt())
-		return __memcpy_c(to, from, len);
+		return __memcpy(to, from, len);
 	return _mmx_memcpy(to, from, len);
 }
 
-static inline void *__memcpy3d(void *to, const void *from, size_t len)
+
+static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
 {
 	if(len<512 || in_interrupt())
-		return __memcpy_g(to, from, len);
+		return __constant_memcpy(to, from, len);
 	return _mmx_memcpy(to, from, len);
 }
 
-#define memcpy(d, s, count) \
-(__builtin_constant_p(count) ? \
- __constant_memcpy3d((d),(s),(count)) : \
- __memcpy3d((d),(s),(count)))
+
+#define memcpy(d, s, count) \
+	(__builtin_constant_p(count) && count) ?\
+	__constant_memcpy3d( d, s, count ) : \
+	__memcpy3d( d, s, count )
 
 #else /* CONFIG_X86_USE_3DNOW */
 
@@ -389,7 +373,10 @@
 */
 
 
-#define memcpy(d, s, count) __memcpy(d, s, count)
+#define memcpy(d, s, count) \
+	(__builtin_constant_p(count) && count) ?\
+	__constant_memcpy( d, s, count ) : \
+	__memcpy( d, s, count )
 
 #endif /* CONFIG_X86_USE_3DNOW */
 
@@ -429,22 +416,7 @@
 }
 
 
-#define __HAVE_ARCH_MEMCMP
-static inline int memcmp(const void * cs,const void * ct,size_t count)
-{
-int d0, d1, d2;
-register int __res;
-__asm__ __volatile__(
-	"repe\n\t"
-	"cmpsb\n\t"
-	"je 1f\n\t"
-	"sbbl %0,%0\n\t"
-	"orb $1,%b0\n"
-	"1:"
-	:"=a" (__res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
-	:"0" (0), "1" (cs), "2" (ct), "3" (count));
-return __res;
-}
+#define memcmp __builtin_memcmp
 
 #define __HAVE_ARCH_MEMCHR
 
@@ -465,141 +437,57 @@
 return __res;
 }
 
-#define __memset_cc(s,c,count) \
-((count%4==0) ? \
- __memset_cc_by4((s),(c),(count)) : \
- ((count%2==0) ? \
- __memset_cc_by2((s),(c),(count)) : \
- __memset_cg((s),(c),(count))))
-
-#define __memset_gc(s,c,count) \
-((count%4==0) ? \
- __memset_gc_by4((s),(c),(count)) : \
- ((count%2==0) ? \
- __memset_gc_by2((s),(c),(count)) : \
- __memset_gg((s),(c),(count))))
-
-#define __HAVE_ARCH_MEMSET
-#define memset(s,c,count) \
-(__builtin_constant_p(c) ? \
- (__builtin_constant_p(count) ? \
- __memset_cc((s),(c),(count)) : \
- __memset_cg((s),(c),(count))) : \
- (__builtin_constant_p(count) ? \
- __memset_gc((s),(c),(count)) : \
- __memset_gg((s),(c),(count))))
-static inline void * __memset_cc_by4(void * s, char c, size_t count)
+static inline void *__memset_generic( void *s, char c, size_t count )
 {
-/*
- * register char *tmp = s;
- */
-register char *tmp = (char *)s;
-register int dummy;
-__asm__ __volatile__ (
-	"\n1:\tmovl %2,(%0)\n\t"
-	"addl $4,%0\n\t"
-	"decl %1\n\t"
-	"jnz 1b"
-	:"=r" (tmp), "=r" (dummy)
-	:"q" (0x01010101UL * (unsigned char) c), "0" (tmp), "1" (count/4)
-	:"memory");
-return s;
-}
+	int d0,d1;
 
-static inline void * __memset_cc_by2(void * s, char c, size_t count)
-{
-register void *tmp = (void *)s;
-register int dummy;
-__asm__ __volatile__ (
-	"shrl $1,%1\n\t" /* may be divisible also by 4 */
-	"jz 2f\n"
-	"\n1:\tmovl %2,(%0)\n\t"
-	"addl $4,%0\n\t"
-	"decl %1\n\t"
-	"jnz 1b\n"
-	"2:\tmovw %w2,(%0)"
-	:"=r" (tmp), "=r" (dummy)
-	:"q" (0x01010101UL * (unsigned char) c), "0" (tmp), "1" (count/2)
-	:"memory");
-return s;
+	__asm__ volatile (
+	"rep\n\t"
+	"stosb\n\t"
+	:"=&c" (d0), "=&D" (d1)
+	:"a" (c), "0" (count), "1" (s)
+	:"memory"
+	);
+
+	return s;
 }
 
-static inline void * __memset_gc_by4(void * s, char c, size_t count)
-{
-register void *tmp = (void *)s;
-register int dummy;
-__asm__ __volatile__ (
-	"movb %b0,%h0\n"
-	"pushw %w0\n\t"
-	"shll $16,%0\n\t"
-	"popw %w0\n"
-	"1:\tmovl %0,(%1)\n\t"
-	"addl $4,%1\n\t"
-	"decl %2\n\t"
-	"jnz 1b\n"
-	:"=q" (c), "=r" (tmp), "=r" (dummy)
-	:"0" ((unsigned) c), "1" (tmp), "2" (count/4)
-	:"memory");
-return s;
-}
-static inline void * __memset_gc_by2(void * s, char c, size_t count)
-{
-register void *tmp = (void *)s;
-register int dummy1,dummy2;
-__asm__ __volatile__ (
-	"movb %b0,%h0\n\t"
-	"shrl $1,%2\n\t" /* may be divisible also by 4 */
-	"jz 2f\n\t"
-	"pushw %w0\n\t"
-	"shll $16,%0\n\t"
-	"popw %w0\n"
-	"1:\tmovl %0,(%1)\n\t"
-	"addl $4,%1\n\t"
-	"decl %2\n\t"
-	"jnz 1b\n"
-	"2:\tmovw %w0,(%1)"
-	:"=q" (dummy1), "=r" (tmp), "=r" (dummy2)
-	:"0" ((unsigned) c), "1" (tmp), "2" (count/2)
-	:"memory");
-return s;
+static inline void *__memset_constant( void *s, char c, size_t count )
+{
+	int d0,d1;
+
+#define MEMST(x) \
+	__asm__ volatile ( \
+	"\n1:\t" \
+	"movl %2,(%0)\n\t" \
+	"addl $4,%0\n\t" \
+	"decl %1\n\t" \
+	"jnz 1b\n" \
+	x \
+	:"=r" (d0), "=r" (d1) \
+	:"q" (0x01010101UL * ((unsigned char)c)), "0" (s), "1" (count/4) \
+	:"memory" \
+	)
+
+	switch ( count % 4 ) {
+	case 0: MEMST(""); return s;
+	case 1: MEMST("\tmovb %b2,(%0)"); return s;
+	case 2: MEMST("\tmovw %w2,(%0)"); return s;
+	default: MEMST("\tmovw %w2,(%0)\n\tmovb %b2,2(%0)"); return s;
+	}
 }
 
-static inline void * __memset_cg(void * s, char c, size_t count)
-{
-int d0, d1;
-register void *tmp = (void *)s;
-__asm__ __volatile__ (
-	"shrl $1,%%ecx\n\t"
-	"rep\n\t"
-	"stosw\n\t"
-	"jnc 1f\n\t"
-	"movb %%al,(%%edi)\n"
-	"1:"
-	:"=&c" (d0), "=&D" (d1)
-	:"a" (0x0101U * (unsigned char) c), "0" (count), "1" (tmp)
-	:"memory");
-return s;
-}
-static inline void * __memset_gg(void * s,char c,size_t count)
-{
-int d0, d1, d2;
-register void *tmp = (void *)s;
-__asm__ __volatile__ (
-	"movb %%al,%%ah\n\t"
-	"shrl $1,%%ecx\n\t"
-	"rep\n\t"
-	"stosw\n\t"
-	"jnc 1f\n\t"
-	"movb %%al,(%%edi)\n"
-	"1:"
-	:"=&c" (d0), "=&D" (d1), "=&D" (d2)
-	:"0" (count), "1" (tmp), "2" (c)
-	:"memory");
-return s;
-}
+#define __memset( s, c, count ) \
+	(__builtin_constant_p( count ) && count ) ? \
+	__memset_constant( s, c, count ) : \
+	__memset_generic( s, c, count )
+
+#define __HAVE_ARCH_MEMSET
+#define memset( s, c, count ) __memset( s, c, count )
+
 /*