Hi, after trying to implement memcmp for big endian, I realized that it is exactly what is needed.
/* On big endian we can just load both operands into registers and compare
   them directly, because the integer ordering is the same as the byte
   ordering; on little endian we first apply bswap to reduce it to the
   big-endian case.

   The following expansion should be close to optimal on architectures that
   have unaligned loads.  It isn't worth doing without them, as emulating
   them bloats the implementation size, so there it is better to do a
   libcall.

   I was surprised when I checked how it expands the pattern
   if (memcmp (x, y, n)): it optimizes the byteswap away, and the expansion
   looks like the best possible.  It could be improved on other platforms.

   First, I ignore 16-bit operations as they tend to be slow.  Instead I do
   an overlapping load of maximal size.  I could add a table of expansions
   without overlap and see what is faster.

   For generic memcmp the assembly could be made shorter, as gcc duplicates
   tails.  For the first 8 bytes I use the fact that a mismatch is likely
   there, so I start by converting them.  The assembly of
   memcmp (x, y, 23) is the following:

           movq    (%rdi), %rax
           movq    (%rsi), %rdx
   .here:
           bswap   %rax
           bswap   %rdx
           cmpq    %rdx, %rax
           je      .L22
   .L20:
           cmpq    %rax, %rdx
   .L21:
           sbbl    %eax, %eax
           andl    $2, %eax
           subl    $1, %eax
           ret
   .L22:
           movq    8(%rdi), %rax
           movq    8(%rsi), %rdx
           cmpq    %rdx, %rax
           je      .L15
           bswap   %rdx
           bswap   %rax
           jmp     .L20
           ...

   You could save space by changing that into

           movq    8(%rdi), %rax
           movq    8(%rsi), %rdx
           cmpq    %rdx, %rax
           jne     .here
           movq    15(%rdi), %rax
           movq    15(%rsi), %rdx
           cmpq    %rdx, %rax
           jne     .here
           xor     %rax, %rax
           ret

   Also there is a bug in that the comparison is duplicated:

           cmpq    %rdx, %rax
           je      .L22
   .L20:
           cmpq    %rax, %rdx

   Comments?  */

#include <string.h>
#include <stdint.h>

/* glibc's <sys/cdefs.h> provides __always_inline; supply an equivalent
   fallback so the file also builds against other C libraries.  */
#ifndef __always_inline
# define __always_inline inline __attribute__ ((__always_inline__))
#endif

/* Route calls whose length is a compile-time constant below 64 to the
   inline expansion; every other call goes to the library memcmp.  The
   inner `memcmp' is not re-expanded because a macro never expands itself
   recursively.  */
#undef memcmp
#define memcmp(x, y, n) \
  (__builtin_constant_p (n) && (n) < 64 \
   ? __memcmp_inline ((x), (y), (n)) \
   : memcmp ((x), (y), (n)))

/* Load a 32-bit word from a possibly unaligned address.  memcpy is used
   instead of dereferencing a cast pointer so that no alignment or
   effective-type assumption is made.  */
static __always_inline uint32_t
memcmp_load32 (const void *p)
{
  uint32_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* Load a 64-bit word from a possibly unaligned address (see above).  */
static __always_inline uint64_t
memcmp_load64 (const void *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

#define LOAD8(x) (*((const uint8_t *) (x)))
#define LOAD32(x) (memcmp_load32 (x))
#define LOAD64(x) (memcmp_load64 (x))

/* Empty placeholder; nothing in this file section uses it.  */
#define CHECK(tp, n)

/* SWAPnn loads a word so that comparing the resulting integers orders the
   operands like a byte-wise comparison, i.e. in big-endian order.  The
   compiler-predefined __BYTE_ORDER__ is used so that the test cannot
   silently evaluate `0 == 0' when <endian.h> has not been included.  */
#if !defined (__BYTE_ORDER__)
# error "compiler does not define __BYTE_ORDER__"
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
# define SWAP32(x) __builtin_bswap32 (LOAD32 (x))
# define SWAP64(x) __builtin_bswap64 (LOAD64 (x))
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
# define SWAP32(x) LOAD32 (x)
# define SWAP64(x) LOAD64 (x)
#else
# error "unsupported byte order"
#endif

/* Nonzero: compare with 64-bit loads.  Zero: build every 8-byte step out
   of two 32-bit ones.  May be predefined by the build to override.  */
#ifndef __ARCH_64BIT
# define __ARCH_64BIT 1
#endif

/* Three-way comparison of two unsigned values: 0, 1 or -1.  */
static __always_inline int
check (uint64_t x, uint64_t y)
{
  if (x == y)
    return 0;
  if (x > y)
    return 1;
  return -1;
}

/* Like check, for values already known to differ: 1 or -1.  */
static __always_inline int
check_nonzero (uint64_t x, uint64_t y)
{
  if (x > y)
    return 1;
  return -1;
}

/* The CHECK* helpers below are statement macros because they return from
   the enclosing function.  They expect `xp' and `yp' (byte pointers to
   the operands), the running offset `i' and the length `n' to be in
   scope.  */

/* Compare the byte at offset i; return on mismatch, otherwise advance.  */
#define CHECK1 \
  do \
    { \
      if (LOAD8 (xp + i) != LOAD8 (yp + i)) \
        return check_nonzero (LOAD8 (xp + i), LOAD8 (yp + i)); \
      i += 1; \
    } \
  while (0)

/* Compare the 4 bytes at offset i; return on mismatch, otherwise advance.
   The first word is byte-swapped up front so the swapped values can be
   reused for the ordering; later words are tested for equality on the raw
   loads and swapped only once a mismatch is known.  */
#define CHECK4 \
  do \
    { \
      if (i == 0 ? (SWAP32 (xp + i) != SWAP32 (yp + i)) \
                 : (LOAD32 (xp + i) != LOAD32 (yp + i))) \
        return check_nonzero (SWAP32 (xp + i), SWAP32 (yp + i)); \
      i += 4; \
    } \
  while (0)

/* Same as CHECK4 for 8 bytes.  */
#define CHECK8 \
  do \
    { \
      if (i == 0 ? (SWAP64 (xp + i) != SWAP64 (yp + i)) \
                 : (LOAD64 (xp + i) != LOAD64 (yp + i))) \
        return check_nonzero (SWAP64 (xp + i), SWAP64 (yp + i)); \
      i += 8; \
    } \
  while (0)

/* Final comparison: return the ordering of the 1/4/8 bytes starting at
   offset i + o.  A negative o makes the load overlap bytes that were
   already found equal, so the tail needs no narrower loads.  */
#define CHECK1FINAL(o) \
  return check (LOAD8 (xp + i + (o)), LOAD8 (yp + i + (o)))
#define CHECK4FINAL(o) \
  return check (SWAP32 (xp + i + (o)), SWAP32 (yp + i + (o)))
#define CHECK8FINAL(o) \
  return check (SWAP64 (xp + i + (o)), SWAP64 (yp + i + (o)))

#if __ARCH_64BIT == 0
# undef CHECK8
# undef CHECK8FINAL
# define CHECK8 \
  do \
    { \
      CHECK4; \
      CHECK4; \
    } \
  while (0)
# define CHECK8FINAL(o) \
  do \
    { \
      CHECK4; \
      CHECK4FINAL (o); \
    } \
  while (0)
#endif

/* Consume 8-byte words while more than 8 bytes remain, so that between 1
   and 8 bytes are left for the tail.  Eight unrolled steps are enough for
   lengths up to 72 bytes.  */
#define LOOP \
  do \
    { \
      if (i + 8 < n) CHECK8; \
      if (i + 8 < n) CHECK8; \
      if (i + 8 < n) CHECK8; \
      if (i + 8 < n) CHECK8; \
      if (i + 8 < n) CHECK8; \
      if (i + 8 < n) CHECK8; \
      if (i + 8 < n) CHECK8; \
      if (i + 8 < n) CHECK8; \
    } \
  while (0)

/* Compare the first N bytes of X and Y and return 0, 1 or -1 with the
   same sign convention as memcmp.  Intended for small constant N (the
   memcmp macro above only uses it for N < 64); N must not exceed 72.
   Only bytes inside [0, N) are read.  */
static __always_inline int
__memcmp_inline (const void *x, const void *y, size_t n)
{
  const unsigned char *xp = (const unsigned char *) x;
  const unsigned char *yp = (const unsigned char *) y;
  size_t i = 0;

  /* After LOOP the number of bytes left is n % 8, or 8 when that is zero.
     Every case ends in a return, so there is no fall-through.  */
  switch (n % 8)
    {
    case 0:
      if (n == 0)
        return 0;
      LOOP;
      CHECK8FINAL (0);

    case 1:
      LOOP;
      CHECK1FINAL (0);

    case 2:
      /* Too short for an overlapping 4-byte load.  */
      if (n == 2)
        {
          CHECK1;
          CHECK1FINAL (0);
        }
      LOOP;
      CHECK4FINAL (-2);

    case 3:
      if (n == 3)
        {
          CHECK1;
          CHECK1;
          CHECK1FINAL (0);
        }
      LOOP;
      CHECK4FINAL (-1);

    case 4:
      LOOP;
      CHECK4FINAL (0);

    case 5:
      if (n == 5)
        {
          CHECK4;
          CHECK1FINAL (0);
        }
      LOOP;
#if __ARCH_64BIT
      CHECK8FINAL (-3);
#else
      CHECK4;
      CHECK1FINAL (0);
#endif

    case 6:
      if (n == 6)
        {
          CHECK4;
          CHECK4FINAL (-2);
        }
      LOOP;
      CHECK8FINAL (-2);

    case 7:
      if (n == 7)
        {
          CHECK4;
          CHECK4FINAL (-1);
        }
      LOOP;
      CHECK8FINAL (-1);
    }

  /* Not reached: n % 8 is always one of the cases above.  */
  return 0;
}

/* Fixed-length instances for inspecting the generated code: ordering
   result for 1, 10, 20 and 30 bytes.  */
int
memcmp1 (const char *x, const char *y)
{
  return memcmp (x, y, 1);
}

int
memcmp10 (const char *x, const char *y)
{
  return memcmp (x, y, 10);
}

int
memcmp20 (const char *x, const char *y)
{
  return memcmp (x, y, 20);
}

int
memcmp30 (const char *x, const char *y)
{
  return memcmp (x, y, 30);
}

/* Equality-only instances.  Despite the name these return 1 when the
   buffers DIFFER and 0 when they are equal.  */
int
memeq1 (const char *x, const char *y)
{
  return memcmp (x, y, 1) != 0;
}

int
memeq10 (const char *x, const char *y)
{
  return memcmp (x, y, 10) != 0;
}

int
memeq20 (const char *x, const char *y)
{
  return memcmp (x, y, 20) != 0;
}

int
memeq30 (const char *x, const char *y)
{
  return memcmp (x, y, 30) != 0;
}