Sorry for answering out of thread, but the message is long gone.
I just remembered very old x86 code of mine.

More than fifteen years ago it was still faster to go for 32-bit
wide testing when more than 20 bytes had to be compared, and to
redo a short byte loop to work around the fact that the carry flag
reflects the wrong byte order for the differing slot.
Maybe of interest.

 |Andrew Doran <a...@netbsd.org> writes:
 |> Hi,
 |>
 |> Change backed out.  Sorry about the disruption.

/// MEMCMP - sir (*)(const void *_ba, const void *_bb, uir _bytes)
// Compares _bytes bytes at _ba and _bb in ascending address order.
// Result (eax): 0 if all bytes are equal or if _bytes is 0, -1 if the first
// differing byte of _ba is below the one of _bb (unsigned compare), else 1.
// Register use: esi = _ba cursor, edi = _bb cursor, ecx = remaining count,
// edx = result, eax = scratch.  esi and edi are saved and restored here.
#undef FUN
#undef FUN_STR
#define FUN             __XXXXXX_mem_Compare
#define FUN_STR "sir XXXXXXX::Mem::Utils::Compare(const void*,const void*,uir)"
ASSERT_FUNVARS_STR()
NYD_FUNVARS_STR()
.global G(FUN)
.type   G(FUN), @function
G(FUN):
        pushl %edi
        pushl %esi
.if __ALL
        GET_GOT()
        NYDIN()
.endif
.if SF_DEBUG
        // NOTE(review): these checks use the PICSO macro whereas the
        // argument loads below use __PICSO; confirm that both exist and
        // yield the same stack offsets
        movl PICSO(12)(%esp), %eax      // _ba
        testl %eax, %eax
        jnz 1f
        ASSERT_CRASH("_ba != NIL")
1:
        movl PICSO(16)(%esp), %eax      // _bb
        testl %eax, %eax
        jnz 2f
        ASSERT_CRASH("_bb != NIL")
2:
.endif
        // load args (_ba, _bb, _bytes)
        movl __PICSO(12)(%esp), %esi
        movl __PICSO(16)(%esp), %edi
        movl __PICSO(20)(%esp), %ecx
        cld                             // forward cried the man from the rear
        xorl %edx, %edx                 // default return
        // nothing to compare means equal.  this must be decided here: a
        // repz cmpsb with a zero count performs no compare and leaves the
        // flags of the cmpl below in place, which would yield -1
        testl %ecx, %ecx
        jz 9f
        cmpl $20, %ecx                  // byte loop?
        jbe 7f                          // (the count is unsigned)
1:      // align at least one on ui4 boundary; use a bytewise loop for that
        // (ecx is above 20 here, and at most three bytes are consumed)
        testl $3, %esi
        jz 2f
        cmpsb
        jne 8f                          // query result (CF)
        decl %ecx
        jmp 1b
2:      // perform a uir loop; does not help us much due to the little endian
        // byte order, but gives us at least an equality indication..
        // (and is much faster than the byteloop ...)
        movl %ecx, %eax                 // save bytecount
        shrl $2, %ecx                   // >> Register::shift
        repz cmpsl
        jne 3f
        movb %al, %cl                   // restore rem. bytecount
        andl $3, %ecx                   // max. two bits remain (<= 3)
        jz 9f
        jmp 7f                          // to the byte loop please
3:      // we have found an unequal slot, but CF aka the result is based on
        // the "wrong" byte order.  this is not easy to solve, thus simply
        // adjust the pointers and the count and restart the byte loop.
        // doing so is easier than the other thinkable approaches?
        // (ecx holds the count of slots not yet compared; adding 4 gives
        // the byte loop at least the four bytes of the unequal slot, and
        // it is bound to stop within those)
        movl $4, %eax                   // avoid immediate ops..
        subl %eax, %esi
        subl %eax, %edi
        addl %eax, %ecx
7:      // byte loop (ecx is never 0 when we get here)
        repz cmpsb
        je 9f
8:      // have result, calculate it accordingly (edx is still 0)
        sbbl %edx, %edx                 // a -= b+CF --> 0 || -1 (borrow sub)
        orb $1, %dl                     // 1 or still -1
9:      // and finalize
        movl %edx, %eax                 // overtake result into eax
.if __ALL
        NYDOUT()
        UNGET_GOT()
.endif
        popl %esi
        popl %edi
        ret
.size   G(FUN), .-G(FUN)
.align  16
// /__XXXXXX_mem_Compare

--steffen
|
|Der Kragenbaer,                The moon bear,
|der holt sich munter           he cheerfully and one by one
|einen nach dem anderen runter  wa.ks himself off
|(By Robert Gernhardt)

Reply via email to