[PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic

Luc Pelletier Sat, 15 Jan 2022 13:40:54 -0800

Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.


rte_memcpy_generic can be called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to copy the bytes one by one in a for loop instead.
Experimentation on compiler explorer indicates that gcc 7+
(released in 2017) and clang 7+ (released in 2018) both replace the
for loop with the minimum number of memory loads and stores if n is
known at compile-time. If n is only known at run-time, gcc and clang
behave differently, but they both seem to recognize that a memcpy is
being done. More recent versions of both gcc and clang seem to produce
even more optimized results.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <[email protected]>
Cc: [email protected]

Signed-off-by: Luc Pelletier <[email protected]>
---

I forgot that code under x86 also needs to compile for 32-bit
(obviously). So, I did some more experimentation and replaced the
assembly code with a regular for loop. Explanations are in the updated
commit message. Experimentation was done on compiler explorer here:
https://godbolt.org/z/zK54rzPEn

 lib/eal/x86/include/rte_memcpy.h | 82 ++++++++------------------------
 1 file changed, 20 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..e422397e49 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,23 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+/**
+ * Copy n bytes from src to dst one byte at a time and return the original
+ * dst; the locations should not overlap. Byte accesses make it usable with
+ * unaligned src/dst. Intended for n <= 15; the loop does not enforce it.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+       void *ret = dst; /* dst is advanced below, so save the start */
+       for (; n; n--) { /* n == 0 copies nothing */
+               *((char *)dst) = *((const char *) src);
+               dst = ((char *)dst) + 1;
+               src = ((const char *)src) + 1;
+       }
+       return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +188,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-       uintptr_t dstu = (uintptr_t)dst;
-       uintptr_t srcu = (uintptr_t)src;
        void *ret = dst;
        size_t dstofss;
        size_t bits;
@@ -181,24 +196,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
         * Copy less than 16 bytes
         */
        if (n < 16) {
-               if (n & 0x01) {
-                       *(uint8_t *)dstu = *(const uint8_t *)srcu;
-                       srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint8_t *)dstu + 1);
-               }
-               if (n & 0x02) {
-                       *(uint16_t *)dstu = *(const uint16_t *)srcu;
-                       srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint16_t *)dstu + 1);
-               }
-               if (n & 0x04) {
-                       *(uint32_t *)dstu = *(const uint32_t *)srcu;
-                       srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint32_t *)dstu + 1);
-               }
-               if (n & 0x08)
-                       *(uint64_t *)dstu = *(const uint64_t *)srcu;
-               return ret;
+               return rte_mov15_or_less_unaligned(dst, src, n);
        }
 
        /**
@@ -379,8 +377,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-       uintptr_t dstu = (uintptr_t)dst;
-       uintptr_t srcu = (uintptr_t)src;
        void *ret = dst;
        size_t dstofss;
        size_t bits;
@@ -389,25 +385,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
         * Copy less than 16 bytes
         */
        if (n < 16) {
-               if (n & 0x01) {
-                       *(uint8_t *)dstu = *(const uint8_t *)srcu;
-                       srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint8_t *)dstu + 1);
-               }
-               if (n & 0x02) {
-                       *(uint16_t *)dstu = *(const uint16_t *)srcu;
-                       srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint16_t *)dstu + 1);
-               }
-               if (n & 0x04) {
-                       *(uint32_t *)dstu = *(const uint32_t *)srcu;
-                       srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint32_t *)dstu + 1);
-               }
-               if (n & 0x08) {
-                       *(uint64_t *)dstu = *(const uint64_t *)srcu;
-               }
-               return ret;
+               return rte_mov15_or_less_unaligned(dst, src, n);
        }
 
        /**
@@ -672,8 +650,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-       uintptr_t dstu = (uintptr_t)dst;
-       uintptr_t srcu = (uintptr_t)src;
        void *ret = dst;
        size_t dstofss;
        size_t srcofs;
@@ -682,25 +658,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
         * Copy less than 16 bytes
         */
        if (n < 16) {
-               if (n & 0x01) {
-                       *(uint8_t *)dstu = *(const uint8_t *)srcu;
-                       srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint8_t *)dstu + 1);
-               }
-               if (n & 0x02) {
-                       *(uint16_t *)dstu = *(const uint16_t *)srcu;
-                       srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint16_t *)dstu + 1);
-               }
-               if (n & 0x04) {
-                       *(uint32_t *)dstu = *(const uint32_t *)srcu;
-                       srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-                       dstu = (uintptr_t)((uint32_t *)dstu + 1);
-               }
-               if (n & 0x08) {
-                       *(uint64_t *)dstu = *(const uint64_t *)srcu;
-               }
-               return ret;
+               return rte_mov15_or_less_unaligned(dst, src, n);
        }
 
        /**
-- 
2.25.1

[PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic

Reply via email to