Hi Bruce, I think one of the loops in rte_memcpy_aligned() takes one too many rounds in the case where the catch-up could replace the last round.
Consider e.g. n = 128: The 64-byte block copy will take two rounds, and the catch-up will copy the last 64 bytes once again. I think that the 64-byte block copy could take only one round and let the catch-up copy the last 64 bytes.

I'm not sure if my suggested method is generally faster than the current method, so I'm passing the ball.

PS: It looks like something similar can be done for the other block copy loops in this file. I haven't dug into the details.

static __rte_always_inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	void *ret = dst;

	/* Copy size < 16 bytes */
	if (n < 16) {
		return rte_mov15_or_less(dst, src, n);
	}

	/* Copy 16 <= size <= 32 bytes */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/* Copy 64 bytes blocks */
-	for (; n >= 64; n -= 64) {
+	for (; n > 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever left */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

Med venlig hilsen / Kind regards,
-Morten Brørup