Hi all,

I made this patch because some people requested
486-optimized string routines for older
(486 and 586) machines.

It does two things:
        - replaced the buggy macro definitions for
        memset and memcpy which broke the build. Also
        added a paranoia check so the constant-size path
        is only taken for counts of at least four bytes,
        since its unrolled word loop cannot handle 0-3
        (a user-space sketch of the dispatch idea follows
        this list);
        - rewrote these functions in order to speed
        them up and reduce the code size.
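
Here is a rough user-space sketch of the dispatch idea. The
demo_* names are mine, made up for illustration, and plain C
stands in for the kernel's inline asm:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

static inline void *demo_memcpy_generic(void *to, const void *from,
                                        size_t len)
{
        return memcpy(to, from, len);   /* stands in for rep;movsl */
}

static inline void *demo_memcpy_constant(void *to, const void *from,
                                         size_t len)
{
        char *d = to;
        const char *s = from;
        size_t words = len / 4;

        /* The body runs before the test, like the decl/jnz loop in
         * the patch; this is why len must be at least 4 here. */
        do {
                memcpy(d, s, 4);
                d += 4;
                s += 4;
        } while (--words);
        if (len & 2) {
                memcpy(d, s, 2);
                d += 2;
                s += 2;
        }
        if (len & 1)
                *d = *s;
        return to;
}

/* The outer parentheses keep the ternary from binding to whatever
 * surrounds the macro at its expansion site. */
#define demo_memcpy(d, s, count)                                \
        ((__builtin_constant_p(count) && (count) >= 4) ?        \
                demo_memcpy_constant((d), (s), (count)) :       \
                demo_memcpy_generic((d), (s), (count)))

int main(void)
{
        char src[8] = "abcdefg", dst[8] = "";

        demo_memcpy(dst, src, 7);       /* constant: unrolled path */
        puts(dst);                      /* prints "abcdefg" */
        return 0;
}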

To anyone interested: please send me comments, bug reports
or ideas.
The goal is to make the kernel use these routines when running
on a 486 or 586 processor, so please test...
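
Something like the harness below could be a starting point. This
is only an assumption on my part: the routines live in a kernel
header, so you would copy the ones under test into the file and
point test_memcpy at them; by default it just checks libc against
itself so the file stays self-contained:

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Point this at the routine under test, e.g. a copy of __memcpy()
 * taken out of string-486.h. */
#ifndef test_memcpy
#define test_memcpy memcpy
#endif

int main(void)
{
        unsigned char src[256], dst[256], ref[256];
        size_t len, off, i;

        for (i = 0; i < sizeof(src); i++)
                src[i] = (unsigned char)i;

        /* Sweep small lengths and misalignments, where the word
         * loop and the 1-3 byte tails are most likely to break. */
        for (len = 0; len < 64; len++) {
                for (off = 0; off < 4; off++) {
                        memset(dst, 0xaa, sizeof(dst));
                        memset(ref, 0xaa, sizeof(ref));
                        test_memcpy(dst + off, src, len);
                        memcpy(ref + off, src, len);
                        assert(memcmp(dst, ref, sizeof(ref)) == 0);
                }
        }
        puts("all lengths and offsets agree with libc");
        return 0;
}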


best,
Petkan
--- linux-2.4.0-test7/include/asm-i386/string-486.h.orig        Tue Aug 29 11:10:51 2000
+++ linux/include/asm-i386/string-486.h Thu Aug 31 16:10:11 2000
@@ -16,10 +16,11 @@
  *     Split into 2 CPU specific files by Alan Cox to keep #ifdef noise down.
  *
  *     1999/10/5       Proper register args for newer GCCs and minor bugs
- *                     fixed - Petko Manolov ([EMAIL PROTECTED])
+ *                     fixed - Petko Manolov ([EMAIL PROTECTED])
  *     1999/10/14      3DNow memscpy() added - Petkan
  *     2000/05/09      extern changed to static in function definitions
  *                     and a few cleanups - Petkan
+ *     2000/08/29      memset and memcpy rewritten - Petkan
  */
 
 #define __HAVE_ARCH_STRCPY
@@ -273,79 +274,60 @@
 /* end of additional stuff */
 
 
-/*
- *     These ought to get tweaked to do some cache priming.
- */
- 
-static inline void * __memcpy_by4(void * to, const void * from, size_t n)
+static inline void *__memcpy(void * to, const void *from, size_t len)
 {
-register void *tmp = (void *)to;
-register int dummy1,dummy2;
-__asm__ __volatile__ (
-       "\n1:\tmovl (%2),%0\n\t"
-       "addl $4,%2\n\t"
-       "movl %0,(%1)\n\t"
-       "addl $4,%1\n\t"
-       "decl %3\n\t"
-       "jnz 1b"
-       :"=r" (dummy1), "=r" (tmp), "=r" (from), "=r" (dummy2) 
-       :"1" (tmp), "2" (from), "3" (n/4)
-       :"memory");
-return (to);
-}
+       int     d0,d1,d2;
 
-static inline void * __memcpy_by2(void * to, const void * from, size_t n)
-{
-register void *tmp = (void *)to;
-register int dummy1,dummy2;
-__asm__ __volatile__ (
-       "shrl $1,%3\n\t"
-       "jz 2f\n"                 /* only a word */
-       "1:\tmovl (%2),%0\n\t"
-       "addl $4,%2\n\t"
-       "movl %0,(%1)\n\t"
-       "addl $4,%1\n\t"
-       "decl %3\n\t"
-       "jnz 1b\n"
-       "2:\tmovw (%2),%w0\n\t"
-       "movw %w0,(%1)"
-       :"=r" (dummy1), "=r" (tmp), "=r" (from), "=r" (dummy2) 
-       :"1" (tmp), "2" (from), "3" (n/2)
-       :"memory");
-return (to);
-}
-
-static inline void * __memcpy_g(void * to, const void * from, size_t n)
-{
-int    d0, d1, d2;
-register void *tmp = (void *)to;
-__asm__ __volatile__ (
-       "shrl $1,%%ecx\n\t"
-       "jnc 1f\n\t"
-       "movsb\n"
-       "1:\tshrl $1,%%ecx\n\t"
-       "jnc 2f\n\t"
+       __asm__ __volatile__ (
+       "rep; movsl\n\t"
+       "testb  $2,%b3\n\t"
+       "jz     1f\n\t"
        "movsw\n"
-       "2:\trep\n\t"
-       "movsl"
+       "1:\t"
+       "testb  $1,%b3\n\t"
+       "jz     2f\n\t"
+       "movsb\n"
+       "2:"
        :"=&c" (d0), "=&D" (d1), "=&S" (d2)
-       :"0" (n), "1" ((long) tmp), "2" ((long) from)
-       :"memory");
-return (to);
+       :"q" (len), "0" (len/4), "1" (to), "2" (from)
+       :"memory"
+       );
+       
+       return  to;
 }
 
-#define __memcpy_c(d,s,count) \
-((count%4==0) ? \
- __memcpy_by4((d),(s),(count)) : \
- ((count%2==0) ? \
-  __memcpy_by2((d),(s),(count)) : \
-  __memcpy_g((d),(s),(count))))
-  
-#define __memcpy(d,s,count) \
-(__builtin_constant_p(count) ? \
- __memcpy_c((d),(s),(count)) : \
- __memcpy_g((d),(s),(count)))
- 
+
+static inline void *__constant_memcpy(void * to, const void *from, size_t len)
+{
+       int     d0,d1,d2;
+       register int    tmp;
+       
+#define        MEMCP(x)                \
+       __asm__ volatile (      \
+       "\n1:\t"                \
+       "movl   (%0),%3\n\t"    \
+       "movl   %3,(%1)\n\t"    \
+       "addl   $4,%1\n\t"      \
+       "addl   $4,%0\n\t"      \
+       "decl   %2\n\t"         \
+       "jnz    1b\n\t"         \
+       x                       \
+       :"=r" (d0), "=r" (d1), "=r" (d2),"=q" (tmp)\
+       :"0" (from), "1" (to), "2" (len/4) \
+       :"memory"               \
+       )
+       
+       switch ( len % 4 ) {
+               case 0: MEMCP(""); return to;
+               case 1: MEMCP("movb (%0),%b3; movb %b3,(%1)"); return to;
+               case 2: MEMCP("movw (%0),%w3; movw %w3,(%1)"); return to;
+               default: MEMCP("movw (%0),%w3; movw %w3,(%1)\n\t"
+                               "movb 2(%0),%b3; movb %b3,2(%1)"); return to;
+       }
+#undef MEMCP
+}
+
+
 #define __HAVE_ARCH_MEMCPY
 
 #include <linux/config.h>
@@ -363,24 +345,26 @@
 **      This CPU favours 3DNow strongly (eg AMD K6-II, K6-III, Athlon)
 */
 
-static inline void * __constant_memcpy3d(void * to, const void * from, size_t len)
+static inline void *__memcpy3d(void *to, const void *from, size_t len)
 {
        if(len<512 || in_interrupt())
-               return __memcpy_c(to, from, len);
+               return __memcpy(to, from, len);
        return _mmx_memcpy(to, from, len);
 }
 
-static inline void *__memcpy3d(void *to, const void *from, size_t len)
+
+static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
 {
        if(len<512 || in_interrupt())
-               return __memcpy_g(to, from, len);
+               return __constant_memcpy(to, from, len);
        return _mmx_memcpy(to, from, len);
 }
 
-#define memcpy(d, s, count) \
-(__builtin_constant_p(count) ? \
- __constant_memcpy3d((d),(s),(count)) : \
- __memcpy3d((d),(s),(count)))
+
+#define memcpy(d, s, count)                            \
+       ((__builtin_constant_p(count) && (count) >= 4) ?\
+       __constant_memcpy3d( d, s, count ) :            \
+       __memcpy3d( d, s, count ))
  
 #else /* CONFIG_X86_USE_3DNOW */
 
@@ -389,7 +373,10 @@
 */
 
 
-#define memcpy(d, s, count) __memcpy(d, s, count)
+#define memcpy(d, s, count)                            \
+       ((__builtin_constant_p(count) && (count) >= 4) ?\
+       __constant_memcpy( d, s, count ) :              \
+       __memcpy( d, s, count ))
 
 #endif /* CONFIG_X86_USE_3DNOW */ 
 
@@ -429,22 +416,7 @@
 }
 
 
-#define        __HAVE_ARCH_MEMCMP
-static inline int memcmp(const void * cs,const void * ct,size_t count)
-{
-int    d0, d1, d2;
-register int __res;
-__asm__ __volatile__(
-       "repe\n\t"
-       "cmpsb\n\t"
-       "je 1f\n\t"
-       "sbbl %0,%0\n\t"
-       "orb $1,%b0\n"
-       "1:"
-       :"=a" (__res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
-       :"0" (0), "1" (cs), "2" (ct), "3" (count));
-return __res;
-}
+#define memcmp         __builtin_memcmp
 
 
 #define __HAVE_ARCH_MEMCHR
@@ -465,141 +437,57 @@
 return __res;
 }
 
-#define __memset_cc(s,c,count) \
-((count%4==0) ? \
- __memset_cc_by4((s),(c),(count)) : \
- ((count%2==0) ? \
-  __memset_cc_by2((s),(c),(count)) : \
-  __memset_cg((s),(c),(count))))
-
-#define __memset_gc(s,c,count) \
-((count%4==0) ? \
- __memset_gc_by4((s),(c),(count)) : \
- ((count%2==0) ? \
-  __memset_gc_by2((s),(c),(count)) : \
-  __memset_gg((s),(c),(count))))
-
-#define __HAVE_ARCH_MEMSET
-#define memset(s,c,count) \
-(__builtin_constant_p(c) ? \
- (__builtin_constant_p(count) ? \
-  __memset_cc((s),(c),(count)) : \
-  __memset_cg((s),(c),(count))) : \
- (__builtin_constant_p(count) ? \
-  __memset_gc((s),(c),(count)) : \
-  __memset_gg((s),(c),(count))))
 
-static inline void * __memset_cc_by4(void * s, char c, size_t count)
+static inline void *__memset_generic( void *s, char c, size_t count )
 {
-/*
- * register char *tmp = s;
- */
-register char *tmp = (char *)s;
-register int  dummy;
-__asm__ __volatile__ (
-       "\n1:\tmovl %2,(%0)\n\t"
-       "addl $4,%0\n\t"
-       "decl %1\n\t"
-       "jnz 1b"
-       :"=r" (tmp), "=r" (dummy)
-       :"q" (0x01010101UL * (unsigned char) c), "0" (tmp), "1" (count/4)
-       :"memory");
-return s;
-}
+       int     d0,d1;
 
-static inline void * __memset_cc_by2(void * s, char c, size_t count)
-{
-register void *tmp = (void *)s;
-register int  dummy;
-__asm__ __volatile__ (
-       "shrl $1,%1\n\t"          /* may be divisible also by 4 */
-       "jz 2f\n"
-       "\n1:\tmovl %2,(%0)\n\t"
-       "addl $4,%0\n\t"
-       "decl %1\n\t"
-       "jnz 1b\n"
-       "2:\tmovw %w2,(%0)"
-       :"=r" (tmp), "=r" (dummy)
-       :"q" (0x01010101UL * (unsigned char) c), "0" (tmp), "1" (count/2)
-       :"memory");
-return s;
+       __asm__ volatile (
+       "rep\n\t"
+       "stosb\n\t"
+       :"=&c" (d0), "=&D" (d1)
+       :"a" (c), "0" (count), "1" (s)
+       :"memory"
+       );
+       
+       return  s;
 }
 
-static inline void * __memset_gc_by4(void * s, char c, size_t count)
-{
-register void *tmp = (void *)s;
-register int dummy;
-__asm__ __volatile__ (
-       "movb %b0,%h0\n"
-       "pushw %w0\n\t"
-       "shll $16,%0\n\t"
-       "popw %w0\n"
-       "1:\tmovl %0,(%1)\n\t"
-       "addl $4,%1\n\t"
-       "decl %2\n\t"
-       "jnz 1b\n"
-       :"=q" (c), "=r" (tmp), "=r" (dummy)
-       :"0" ((unsigned) c),  "1"  (tmp), "2" (count/4)
-       :"memory");
-return s;
-}
 
-static inline void * __memset_gc_by2(void * s, char c, size_t count)
-{
-register void *tmp = (void *)s;
-register int dummy1,dummy2;
-__asm__ __volatile__ (
-       "movb %b0,%h0\n\t"
-       "shrl $1,%2\n\t"          /* may be divisible also by 4 */
-       "jz 2f\n\t"
-       "pushw %w0\n\t"
-       "shll $16,%0\n\t"
-       "popw %w0\n"
-       "1:\tmovl %0,(%1)\n\t"
-       "addl $4,%1\n\t"
-       "decl %2\n\t"
-       "jnz 1b\n"
-       "2:\tmovw %w0,(%1)"
-       :"=q" (dummy1), "=r" (tmp), "=r" (dummy2)
-       :"0" ((unsigned) c),  "1"  (tmp), "2" (count/2)
-       :"memory");
-return s;
+static inline void *__memset_constant( void *s, char c, size_t count )
+{
+       int     d0,d1;
+       
+#define        MEMST(x)                \
+       __asm__ volatile (      \
+       "\n1:\t"                \
+       "movl   %2,(%0)\n\t"    \
+       "addl   $4,%0\n\t"      \
+       "decl   %1\n\t"         \
+       "jnz    1b\n"           \
+       x                       \
+       :"=r" (d0), "=r" (d1)   \
+       :"q" (0x01010101UL * ((unsigned char)c)), "0" (s), "1" (count/4) \
+       :"memory"               \
+       )
+       
+       switch ( count % 4 ) {
+               case 0: MEMST(""); return s;
+               case 1: MEMST("\tmovb   %b2,(%0)"); return s;
+               case 2: MEMST("\tmovw   %w2,(%0)"); return s;
+               default: MEMST("\tmovw  %w2,(%0)\n\tmovb %b2,2(%0)"); return s;
+       }
 }
 
-static inline void * __memset_cg(void * s, char c, size_t count)
-{
-int    d0, d1;
-register void *tmp = (void *)s;
-__asm__ __volatile__ (
-       "shrl $1,%%ecx\n\t"
-       "rep\n\t"
-       "stosw\n\t"
-       "jnc 1f\n\t"
-       "movb %%al,(%%edi)\n"
-       "1:"
-       :"=&c" (d0), "=&D" (d1) 
-       :"a" (0x0101U * (unsigned char) c), "0" (count), "1" (tmp)
-       :"memory");
-return s;
-}
 
-static inline void * __memset_gg(void * s,char c,size_t count)
-{
-int    d0, d1, d2;
-register void *tmp = (void *)s;
-__asm__ __volatile__ (
-       "movb %%al,%%ah\n\t"
-       "shrl $1,%%ecx\n\t"
-       "rep\n\t"
-       "stosw\n\t"
-       "jnc 1f\n\t"
-       "movb %%al,(%%edi)\n"
-       "1:"
-       :"=&c" (d0), "=&D" (d1), "=&D" (d2)
-       :"0" (count), "1" (tmp), "2" (c)
-       :"memory");
-return s;
-}
+#define        __memset( s, c, count )                         \
+       ((__builtin_constant_p( count ) && (count) >= 4) ?      \
+               __memset_constant( s, c, count ) :              \
+               __memset_generic( s, c, count ))
+
+#define __HAVE_ARCH_MEMSET
+#define memset( s, c, count ) __memset( s, c, count )
+
 
 
 /*
