Calls to rte_memcpy_generic could result in unaligned loads/stores for 1 < n < 16. This is undefined behavior according to the C standard, and it gets flagged by the clang undefined behavior sanitizer.
rte_memcpy_generic is called with unaligned src and dst addresses. When 1 < n < 16, the code would cast both src and dst to a qword, dword or word pointer, without verifying the alignment of src/dst. The code was changed to use a for loop to copy the bytes one by one. Experimentation on compiler explorer indicates that gcc 7+ (released in 2017) and clang 7+ (released in 2018) both optimize out the for loop with the least number of memory loads and stores, if n is known at compile-time. If n is only known at compile-time, gcc and clang have different behaviour but they both seem to recognize that a memcpy is being done. More recent versions of both gcc/clang seem to also produce even more optimized results. Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time") Cc: Xiaoyun Li <xiaoyun...@intel.com> Cc: sta...@dpdk.org Signed-off-by: Luc Pelletier <lucp.at.w...@gmail.com> --- I forgot that code under x86 also needs to compile for 32-bit (obviously). So, I did some more experimentation and replaced the assembly code with a regular for loop. Explanations are in the updated commit message. Experimentation was done on compiler explorer here: https://godbolt.org/z/zK54rzPEn lib/eal/x86/include/rte_memcpy.h | 82 ++++++++------------------------ 1 file changed, 20 insertions(+), 62 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 1b6c6e585f..e422397e49 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -45,6 +45,23 @@ extern "C" { static __rte_always_inline void * rte_memcpy(void *dst, const void *src, size_t n); +/** + * Copy bytes from one location to another, + * locations should not overlap. + * Use with unaligned src/dst, and n <= 15. + */ +static __rte_always_inline void * +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n) +{ + void *ret = dst; + for (; n; n--) { + *((char *)dst) = *((const char *) src); + dst = ((char *)dst) + 1; + src = ((const char *)src) + 1; + } + return ret; +} + #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 #define ALIGNMENT_MASK 0x3F @@ -171,8 +188,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -181,24 +196,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) - *(uint64_t *)dstu = *(const uint64_t *)srcu; - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -379,8 +377,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n) static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t bits; @@ -389,25 +385,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** @@ -672,8 +650,6 @@ static __rte_always_inline void * rte_memcpy_generic(void *dst, const void *src, size_t n) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - uintptr_t dstu = (uintptr_t)dst; - uintptr_t srcu = (uintptr_t)src; void *ret = dst; size_t dstofss; size_t srcofs; @@ -682,25 +658,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * Copy less than 16 bytes */ if (n < 16) { - if (n & 0x01) { - *(uint8_t *)dstu = *(const uint8_t *)srcu; - srcu = (uintptr_t)((const uint8_t *)srcu + 1); - dstu = (uintptr_t)((uint8_t *)dstu + 1); - } - if (n & 0x02) { - *(uint16_t *)dstu = *(const uint16_t *)srcu; - srcu = (uintptr_t)((const uint16_t *)srcu + 1); - dstu = (uintptr_t)((uint16_t *)dstu + 1); - } - if (n & 0x04) { - *(uint32_t *)dstu = *(const uint32_t *)srcu; - srcu = (uintptr_t)((const uint32_t *)srcu + 1); - dstu = (uintptr_t)((uint32_t *)dstu + 1); - } - if (n & 0x08) { - *(uint64_t *)dstu = *(const uint64_t *)srcu; - } - return ret; + return rte_mov15_or_less_unaligned(dst, src, n); } /** -- 2.25.1