Sorry for answering out of thread, but the message is long gone. I just remembered very old x86 code of mine.
More than fifteen years ago it was still faster to go for 32-bit wide testing when more than 20 bytes had to be compared, and to redo a short byte loop to work around the fact that the carry flag stats the wrong byteorder for the matching slot. Maybe of interest.

|Andrew Doran <a...@netbsd.org> writes:
|> Hi,
|>
|> Change backed out. Sorry about the disruption.

/// MEMCMP - sir (*)(const void *_ba, const void *_bb, uir _bytes)
//
// Compares _bytes bytes at _ba against those at _bb, as unsigned bytes.
//
// In:     three arguments on the stack: _ba, _bb, _bytes (in that order)
// Out:    %eax = 0 if all bytes are equal (always so for _bytes == 0),
//                -1 if the first differing byte of _ba is below that of _bb,
//                 1 if it is above
// Saves:  %esi, %edi (pushed on entry, popped on exit)
// Uses:   %eax, %ecx, %edx and the flags; DF is cleared via cld
//
// Strategy: up to 20 bytes are compared bytewise.  Longer inputs are
// byte-compared until %esi is 4-byte aligned, then compared in 32-bit
// units; on a mismatching unit the pointers are rewound and the byte loop
// determines the ordering.
//
// NOTE(review): the SF_DEBUG checks address the arguments via PICSO() while
// the actual loads use __PICSO(); both macros are defined elsewhere --
// confirm that they yield the same stack offsets.
#undef FUN
#undef FUN_STR
#define FUN __XXXXXX_mem_Compare
#define FUN_STR "sir XXXXXXX::Mem::Utils::Compare(const void*,const void*,uir)"
ASSERT_FUNVARS_STR()
NYD_FUNVARS_STR()
        .global G(FUN)
        .type   G(FUN), @function
G(FUN):
        pushl   %edi
        pushl   %esi
.if __ALL
        GET_GOT()
        NYDIN()
.endif
.if SF_DEBUG
        movl    PICSO(12)(%esp), %eax   // _ba
        testl   %eax, %eax
        jnz     1f
        ASSERT_CRASH("_ba != NIL")
1:
        movl    PICSO(16)(%esp), %eax   // _bb
        testl   %eax, %eax
        jnz     2f
        ASSERT_CRASH("_bb != NIL")
2:
.endif
        // load args (_ba, _bb, _bytes)
        movl    __PICSO(12)(%esp), %esi
        movl    __PICSO(16)(%esp), %edi
        movl    __PICSO(20)(%esp), %ecx
        cld                             // string ops walk forward
        xorl    %edx, %edx              // default return: equal

        // Nothing to compare: "repz cmpsb" with a zero count executes no
        // iteration and leaves the flags untouched, so the stale flags of a
        // preceding cmpl would be misread as "unequal".  Return 0 right away.
        testl   %ecx, %ecx
        jz      9f

        cmpl    $20, %ecx               // short input: byte loop only
        jbe     7f                      // unsigned, _bytes is a size

1:
        // byte-compare until %esi sits on a 4-byte boundary
        testl   $3, %esi
        jz      2f
        cmpsb
        jne     8f                      // mismatch: CF holds the ordering
        decl    %ecx
        jmp     1b

2:
        // 32-bit wide loop; due to the little endian byte order its CF
        // cannot be used for the ordering, but it tells us about equality
        // (and is much faster than the byte loop)
        movl    %ecx, %eax              // save byte count
        shrl    $2, %ecx                // number of 32-bit units
        repz cmpsl
        jne     3f
        movb    %al, %cl                // %ecx is 0 here; restore low bits
        andl    $3, %ecx                // remaining bytes (<= 3)
        jz      9f
        jmp     7f                      // finish in the byte loop

3:
        // Found an unequal 32-bit unit, but CF is based on the "wrong" byte
        // order.  Step both pointers back onto that unit and let the byte
        // loop redo it.  %ecx holds the count of units not yet compared;
        // adding 4 ensures the byte loop may look at >= 4 bytes, and the
        // mismatch is known to be within the first 4.
        movl    $4, %eax                // avoid immediate ops..
        subl    %eax, %esi
        subl    %eax, %edi
        addl    %eax, %ecx

7:
        // byte loop (%ecx is never 0 here)
        repz cmpsb
        je      9f

8:
        // have result, calculate it from CF (%edx is still 0)
        sbbl    %edx, %edx              // 0 - 0 - CF --> 0 or -1
        orb     $1, %dl                 // 0 --> 1, -1 stays -1

9:
        // and finalize
        movl    %edx, %eax              // result goes into %eax
.if __ALL
        NYDOUT()
        UNGET_GOT()
.endif
        popl    %esi
        popl    %edi
        ret
        .size   G(FUN), .-G(FUN)
        .align 16
// /__XXXXXX_mem_Compare

--steffen
|
|Der Kragenbaer,                The moon bear,
|der holt sich munter           he cheerfully and one by one
|einen nach dem anderen runter  wa.ks himself off
|(By Robert Gernhardt)