Hi,

After trying to implement memcmp for big endian I realized that a
big-endian comparison is exactly what is needed here.

On big endian we can simply load both operands into registers and
compare them directly, since the byte order already matches the
comparison order; on little endian we just need a bswap first to get
the big-endian order.
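
A minimal sketch of the idea for a little-endian target, assuming
<stdint.h> and <string.h> (cmp8 is an illustrative helper, not part
of the patch):

static int
cmp8 (const void *x, const void *y)
{
  uint64_t a, b;
  /* Unaligned-safe loads; GCC folds these to single moves.  */
  memcpy (&a, x, 8);
  memcpy (&b, y, 8);
  /* After bswap, unsigned word order equals lexicographic byte
     order, i.e. memcmp order.  */
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  return a == b ? 0 : a > b ? 1 : -1;
}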

The following expansion should be close to optimal on architectures
that have unaligned loads. It isn't worth doing on architectures
without them, as emulating unaligned loads bloats the implementation
size, so a libcall is better there.

I was surprised when I checked how this expands the pattern
if (memcmp (x, y, n))
GCC optimizes the byteswaps away, and the expansion of that looks
like the best possible.
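
Indeed, both operands get the same bswap, and bswap (a) == bswap (b)
holds iff a == b, so for equality-only uses the swaps cancel. A
sketch of what the test reduces to (neq8 is illustrative only):

static int
neq8 (const void *x, const void *y)
{
  uint64_t a, b;
  memcpy (&a, x, 8);
  memcpy (&b, y, 8);
  /* Only equality matters, so no byteswaps are needed.  */
  return a != b;
}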

It could be improved on other platforms. First, I ignore 16-bit
operations, as they tend to be slow; instead I do an overlapping load
of maximal size. I could add a table of expansions without overlap
and see what is faster.
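
For example, a 7-byte tail can be covered by two overlapping 4-byte
checks instead of a 4+2+1 sequence. A sketch in terms of the SWAP32
and check helpers from the code below (byte 3 is simply compared
twice, which is harmless):

  if (SWAP32 (x) != SWAP32 (y))
    return check_nonzero (SWAP32 (x), SWAP32 (y));
  return check (SWAP32 (x + 3), SWAP32 (y + 3));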

For generic memcmp the assembly could be made shorter, as gcc
duplicates the tails. For the first 8 bytes I use the fact that a
difference is most likely there, so I start by converting them; the
assembly of memcmp (x, y, 23) is the following.

        movq    (%rdi), %rax
        movq    (%rsi), %rdx
.here:  
        bswap   %rax
        bswap   %rdx
        cmpq    %rdx, %rax
        je      .L22
.L20:
        cmpq    %rax, %rdx
.L21:
        sbbl    %eax, %eax
        andl    $2, %eax
        subl    $1, %eax
        ret
.L22:
        movq    8(%rdi), %rax
        movq    8(%rsi), %rdx
        cmpq    %rdx, %rax
        je      .L15
        bswap   %rdx
        bswap   %rax
        jmp     .L20
...
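
(The sbbl/andl/subl tail materializes the result without a branch:
sbbl %eax, %eax turns the carry flag into -1 or 0, which
andl $2 / subl $1 then map to 1 or -1.)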

You could save space by changing that into:

        movq    8(%rdi), %rax
        movq    8(%rsi), %rdx
        cmpq    %rdx, %rax
        jne      .here
        movq    15(%rdi), %rax
        movq    15(%rsi), %rdx
        cmpq    %rdx, %rax
        jne      .here
        xor     %rax, %rax
        ret
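
(Here each mismatch jumps back to the shared bswap-and-compare
sequence at .here, so that tail is emitted only once, and the
fall-through path returns 0.)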

Also there is a bug: the comparison is duplicated:

        cmpq    %rdx, %rax
        je      .L22
.L20:
        cmpq    %rax, %rdx

Comments?


#include <string.h>
#include <stdint.h>
#include <endian.h>

#undef memcmp
#define memcmp(x, y, n) (__builtin_constant_p (n) && (n) < 64 \
                         ? __memcmp_inline (x, y, n) \
                         : memcmp (x, y, n))

#define LOAD8(x) (*((const uint8_t *) (x)))
#define LOAD32(x) (*((const uint32_t *) (x)))
#define LOAD64(x) (*((const uint64_t *) (x)))
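
/* These direct casts assume the target tolerates unaligned access
   and sidestep strict aliasing, matching the "architectures that
   have unaligned loads" assumption above.  A more portable sketch
   would route loads through __builtin_memcpy, which GCC folds to a
   single move:

     static __always_inline uint32_t
     load32 (const void *p)
     {
       uint32_t v;
       __builtin_memcpy (&v, p, sizeof v);
       return v;
     }
*/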

#define CHECK(tp, n)
#if __BYTE_ORDER == __LITTLE_ENDIAN
# define SWAP32(x) __builtin_bswap32 (LOAD32 (x))
# define SWAP64(x) __builtin_bswap64 (LOAD64 (x))
#else
# define SWAP32(x) LOAD32 (x)
# define SWAP64(x) LOAD64 (x)
#endif

#define __ARCH_64BIT 1

static __always_inline
int
check (uint64_t x, uint64_t y)
{
  if (x == y)
    return 0;
  if (x > y)
    return 1;

  return -1;
}

static __always_inline
int
check_nonzero (uint64_t x, uint64_t y)
{
  if (x > y)
    return 1;

  return -1;
}


static __always_inline
int
__memcmp_inline (const void *x, const void *y, size_t n)
{
#define CHECK1 if (LOAD8 (x + i) - LOAD8 (y + i)) \
    return check_nonzero (LOAD8 (x + i), LOAD8 (y + i)); i = i + 1;
#define CHECK4 if (i == 0 ? SWAP32 (x + i) - SWAP32 (y + i)\
                      : LOAD32 (x + i) - LOAD32 (y + i)) \
    return check_nonzero (SWAP32 (x + i), SWAP32 (y + i)); i = i + 4;
#define CHECK8 if (i == 0 ? SWAP64 (x + i) - SWAP64 (y + i)\
                      : LOAD64 (x + i) - LOAD64 (y + i)) \
    return check_nonzero (SWAP64 (x + i), SWAP64 (y + i)); i = i + 8;
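
/* The i == 0 word is tested in already-swapped form: a difference is
   most likely in the first bytes, so converting them up front saves
   the extra byteswap on the return path.  */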

#define CHECK1FINAL(o) return check (LOAD8 (x + i + o), LOAD8 (y + i + o));
#define CHECK4FINAL(o) return check (SWAP32 (x + i + o), SWAP32 (y + i + o));
#define CHECK8FINAL(o) return check (SWAP64 (x + i + o), SWAP64 (y + i + o));

#if __ARCH_64BIT == 0
# undef CHECK8
# undef CHECK8FINAL
# define CHECK8 CHECK4 CHECK4
# define CHECK8FINAL(o) CHECK4 CHECK4FINAL (o)
#endif

#define LOOP if (i + 8 < n) { CHECK8 } \
if (i + 8 < n) { CHECK8 } \
if (i + 8 < n) { CHECK8 } \
if (i + 8 < n) { CHECK8 } \
if (i + 8 < n) { CHECK8 } \
if (i + 8 < n) { CHECK8 } \
if (i + 8 < n) { CHECK8 } \
if (i + 8 < n) { CHECK8 } 
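
/* Eight guarded steps suffice: the wrapper macro only dispatches
   here for constant n < 64, so at most seven CHECK8 iterations run
   before the final (possibly overlapped) check.  */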


  long i = 0;

  switch (n % 8)
    {
    case 0:
      if (n == 0)
        return 0;

      LOOP; CHECK8FINAL (0);
    case 1:
      LOOP CHECK1FINAL (0);
    case 2:
      if (n == 2)
        {
          CHECK1 CHECK1FINAL (0);
        }
      LOOP CHECK4FINAL (-2);
    case 3:
      if (n == 3)
        {
          CHECK1 CHECK1 CHECK1FINAL (0);
        }
      LOOP CHECK4FINAL (-1);
    case 4:
      LOOP CHECK4FINAL (0);
    case 5:
      if (n == 5)
        {
          CHECK4 CHECK1FINAL (0);
        }
#if __ARCH_64BIT
      LOOP CHECK8FINAL (-3);
#else
      LOOP CHECK4 CHECK1FINAL (0);
#endif
    case 6:
      if (n == 6)
        {
          CHECK4 CHECK4FINAL (-2);
        }
      LOOP CHECK8FINAL (-2);
    case 7:
      if (n == 7)
        {
          CHECK4 CHECK4FINAL (-1);
        }
      LOOP CHECK8FINAL (-1);
    }
  /* All n % 8 cases above return.  */
  __builtin_unreachable ();
}

int
memcmp1 (char *x, char *y)
{
  return memcmp (x, y, 1);
}
int
memcmp10 (char *x, char *y)
{
  return memcmp (x, y, 10);
}
int
memcmp20 (char *x, char *y)
{
  return memcmp (x, y, 20);
}
int
memcmp30 (char *x, char *y)
{
  return memcmp (x, y, 30);
}

int
memeq1 (char *x, char *y)
{
  return memcmp (x, y, 1) != 0;
}
int
memeq10 (char *x, char *y)
{
  return memcmp (x, y, 10) != 0;
}
int
memeq20 (char *x, char *y)
{
  return memcmp (x, y, 20) != 0;
}
int
memeq30 (char *x, char *y)
{
  return memcmp (x, y, 30) != 0;
}
