On Saturday 02 April 2005 15:18, Denis Vlasenko wrote:
> -O2 compile does inline copying, however, suboptimally.
> Pushing/popping esi/edi on the stack is not needed.
> Also "mov $1,ecx; rep; movsl" is rather silly.
I think I was wrong about the push/pop, sorry. However, the other
observation is still valid. You may wish to compile this updated t.c
and see.
--
vda
static inline void * __memcpy(void * to, const void * from, int n)
{
	int d0, d1, d2;
	__asm__ __volatile__(
		"rep ; movsl\n\t"
		"movl %4,%%ecx\n\t"
		"andl $3,%%ecx\n\t"
		"jz 1f\n\t"	/* pay 2 byte penalty for a chance to skip microcoded rep */
		"rep ; movsb\n\t"
		"1:"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
		: "memory");
	return (to);
}

/*
 * This looks ugly, but the compiler can optimize it totally,
 * as the count is constant.
 */
static inline void * __constant_memcpy(void * to, const void * from, int n)
{
#if 1 /* want to do small copies with non-string ops? */
	switch (n) {
	case 0: return to;
	case 1: *(char*)to = *(char*)from; return to;
	case 2: *(short*)to = *(short*)from; return to;
	case 4: *(int*)to = *(int*)from; return to;
#if 1 /* including those doable with two moves? */
	case 3: *(short*)to = *(short*)from;
		*((char*)to+2) = *((char*)from+2); return to;
	case 5: *(int*)to = *(int*)from;
		*((char*)to+4) = *((char*)from+4); return to;
	case 6: *(int*)to = *(int*)from;
		*((short*)to+2) = *((short*)from+2); return to;
	case 8: *(int*)to = *(int*)from;
		*((int*)to+1) = *((int*)from+1); return to;
#endif
	}
#else
	if (!n) return to;
#endif
	{ /* load esi/edi */
		int esi, edi;
		__asm__ __volatile__(
			""
			: "=&D" (edi), "=&S" (esi)
			: "0" ((long) to), "1" ((long) from)
			: "memory"
		);
	}
	if (n >= 5*4) {
		/* large block: use rep prefix */
		int ecx;
		__asm__ __volatile__(
			"rep ; movsl"
			: "=&c" (ecx)
			: "0" (n/4)
		);
	} else {
		/* small block: don't clobber ecx + smaller code */
		if (n >= 4*4) __asm__ __volatile__("movsl");
		if (n >= 3*4) __asm__ __volatile__("movsl");
		if (n >= 2*4) __asm__ __volatile__("movsl");
		if (n >= 1*4) __asm__ __volatile__("movsl");
	}
	switch (n % 4) { /* tail */
	case 0: return to;
	case 1: __asm__ __volatile__("movsb"); return to;
	case 2: __asm__ __volatile__("movsw"); return to;
	default: __asm__ __volatile__("movsw\n\tmovsb"); return to;
	}
}

#define memcpy(t, f, n) \
	(__builtin_constant_p(n) ? \
	__constant_memcpy((t),(f),(n)) : \
	__memcpy((t),(f),(n)))

#define STRUCT1(n) struct s##n { char c[n]; } v##n, w##n; void f##n(void) { v##n = w##n; } void g##n(void) { memcpy(&v##n,&w##n,n); }
#define STRUCT(n) STRUCT1(n)

STRUCT(1)
STRUCT(2)
STRUCT(3)
STRUCT(4)
STRUCT(5)
STRUCT(6)
STRUCT(7)
STRUCT(8)
STRUCT(9)
STRUCT(10)
STRUCT(11)
STRUCT(12)
STRUCT(13)
STRUCT(14)
STRUCT(15)
STRUCT(16)
STRUCT(17)
STRUCT(18)
STRUCT(19)
STRUCT(20)
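
If you also want a quick runtime sanity check, something like the main()
below could be appended after the STRUCT() lines. It is not part of t.c as
posted, just a sketch: it assumes a 32-bit x86 build (gcc -O2, add -m32 on
a 64-bit box) and pushes one constant-size copy through the memcpy() macro,
then verifies the bytes:

#include <stdio.h>

int main(void)
{
	char src[20], dst[20];
	int i, bad = 0;

	for (i = 0; i < 20; i++) {
		src[i] = (char)(i + 1);	/* recognizable byte pattern */
		dst[i] = 0;
	}

	/* constant n, not handled by the switch() above:
	   copied as 4+2+1 bytes via movsl/movsw/movsb */
	memcpy(dst, src, 7);

	for (i = 0; i < 7; i++)
		if (dst[i] != src[i])
			bad = 1;
	for (i = 7; i < 20; i++)
		if (dst[i] != 0)
			bad = 1;	/* must not write past n */

	printf("7-byte constant copy: %s\n", bad ? "FAILED" : "ok");
	return 0;
}

For the codegen comparison itself, gcc -O2 -S t.c and diffing each fN()
against the matching gN() is more telling than running it.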