> Calls to rte_memcpy_generic could result in unaligned loads/stores for
> 1 < n < 16. This is undefined behavior according to the C standard,
> and it gets flagged by the clang undefined behavior sanitizer.
>
> rte_memcpy_generic is called with unaligned src and dst addresses.
> When 1 < n < 16, the code would cast both src and dst to a qword,
> dword or word pointer, without verifying the alignment of src/dst. The
> code was changed to use a packed structure to perform the unaligned
> load/store operations, which makes the unaligned load/store operations
> C standards-compliant.
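For reference, here is a minimal standalone sketch of the two patterns being discussed: the cast-and-dereference that UBSan flags, and the packed-struct idiom the patch switches to. The function and struct names below are mine, not from the patch, and the sketch assumes a GCC/clang toolchain where __rte_packed expands to __attribute__((__packed__)).

#include <stdint.h>

/* Old pattern: cast a possibly unaligned pointer to uint64_t * and
 * dereference it. This happens to work on x86, but it is undefined
 * behavior per the C standard and is what clang's UBSan reports.
 */
static uint64_t
load64_ub(const void *p)
{
	return *(const uint64_t *)p;	/* UB if p is not 8-byte aligned */
}

/* New pattern: a single-member packed struct has alignment 1, so
 * GCC/clang generate an access that is valid for any address.
 */
struct unaligned64 { uint64_t val; } __attribute__((__packed__));

static uint64_t
load64_ok(const void *p)
{
	return ((const struct unaligned64 *)p)->val;
}

Both typically compile down to the same single 8-byte mov on x86; the difference is only in what the compiler (and the sanitizer) are told about alignment.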
Still not sure we need to fix that: this is an x86-specific code path, and
as I remember, on x86 there are no penalties for unaligned access to
2/4/8-byte values. Though I like the introduction of the
rte_mov15_or_less() function - it helps with code dedup (see the macro
sketch after the diff below for one way to take that dedup further).

> Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
> Cc: Xiaoyun Li <xiaoyun...@intel.com>
> Cc: sta...@dpdk.org
>
> Signed-off-by: Luc Pelletier <lucp.at.w...@gmail.com>
> ---
>
> Thanks to Stephen's pointer to look at the linux kernel, I was able to
> find a way to perform the unaligned load/store using pure C code. The
> new functions added to perform the load/store could likely be moved to a
> different file and the code duplication could likely be eliminated by
> using a macro. However, I will hold off on making these changes until I
> get confirmation from maintainers that this technique is acceptable and
> this is what we want to move forward with.
>
>  lib/eal/x86/include/rte_memcpy.h | 142 +++++++++++++++++--------------
>  1 file changed, 80 insertions(+), 62 deletions(-)
>
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 1b6c6e585f..4e876d39eb 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -45,6 +45,83 @@ extern "C" {
>  static __rte_always_inline void *
>  rte_memcpy(void *dst, const void *src, size_t n);
>
> +static __rte_always_inline uint64_t
> +rte_load_unaligned_uint64(const void *ptr)
> +{
> +	struct unaligned_uint64 { uint64_t val; } __rte_packed;
> +	return ((const struct unaligned_uint64 *)ptr)->val;
> +}
> +
> +static __rte_always_inline uint32_t
> +rte_load_unaligned_uint32(const void *ptr)
> +{
> +	struct unaligned_uint32 { uint32_t val; } __rte_packed;
> +	return ((const struct unaligned_uint32 *)ptr)->val;
> +}
> +
> +static __rte_always_inline uint16_t
> +rte_load_unaligned_uint16(const void *ptr)
> +{
> +	struct unaligned_uint16 { uint16_t val; } __rte_packed;
> +	return ((const struct unaligned_uint16 *)ptr)->val;
> +}
> +
> +static __rte_always_inline void
> +rte_store_unaligned_uint64(void *ptr, uint64_t val)
> +{
> +	struct unaligned_uint64 { uint64_t val; } __rte_packed;
> +	((struct unaligned_uint64 *)ptr)->val = val;
> +}
> +
> +static __rte_always_inline void
> +rte_store_unaligned_uint32(void *ptr, uint32_t val)
> +{
> +	struct unaligned_uint32 { uint32_t val; } __rte_packed;
> +	((struct unaligned_uint32 *)ptr)->val = val;
> +}
> +
> +static __rte_always_inline void
> +rte_store_unaligned_uint16(void *ptr, uint16_t val)
> +{
> +	struct unaligned_uint16 { uint16_t val; } __rte_packed;
> +	((struct unaligned_uint16 *)ptr)->val = val;
> +}
> +
> +/**
> + * Copy bytes from one location to another,
> + * locations should not overlap.
> + * Use with unaligned src/dst, and n <= 15.
> + */
> +static __rte_always_inline void *
> +rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
> +{
> +	void *ret = dst;
> +	if (n & 8) {
> +		rte_store_unaligned_uint64(
> +			dst,
> +			rte_load_unaligned_uint64(src));
> +		src = ((const uint64_t *)src + 1);
> +		dst = ((uint64_t *)dst + 1);
> +	}
> +	if (n & 4) {
> +		rte_store_unaligned_uint32(
> +			dst,
> +			rte_load_unaligned_uint32(src));
> +		src = ((const uint32_t *)src + 1);
> +		dst = ((uint32_t *)dst + 1);
> +	}
> +	if (n & 2) {
> +		rte_store_unaligned_uint16(
> +			dst,
> +			rte_load_unaligned_uint16(src));
> +		src = ((const uint16_t *)src + 1);
> +		dst = ((uint16_t *)dst + 1);
> +	}
> +	if (n & 1)
> +		*(uint8_t *)dst = *(const uint8_t *)src;
> +	return ret;
> +}
> +
>  #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>
>  #define ALIGNMENT_MASK 0x3F
> @@ -171,8 +248,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
> -	uintptr_t dstu = (uintptr_t)dst;
> -	uintptr_t srcu = (uintptr_t)src;
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
> @@ -181,24 +256,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Copy less than 16 bytes
>  	 */
>  	if (n < 16) {
> -		if (n & 0x01) {
> -			*(uint8_t *)dstu = *(const uint8_t *)srcu;
> -			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -		}
> -		if (n & 0x02) {
> -			*(uint16_t *)dstu = *(const uint16_t *)srcu;
> -			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -		}
> -		if (n & 0x04) {
> -			*(uint32_t *)dstu = *(const uint32_t *)srcu;
> -			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -		}
> -		if (n & 0x08)
> -			*(uint64_t *)dstu = *(const uint64_t *)srcu;
> -		return ret;
> +		return rte_mov15_or_less_unaligned(dst, src, n);
>  	}
>
>  	/**
> @@ -379,8 +437,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
>  static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
> -	uintptr_t dstu = (uintptr_t)dst;
> -	uintptr_t srcu = (uintptr_t)src;
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t bits;
> @@ -389,25 +445,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Copy less than 16 bytes
>  	 */
>  	if (n < 16) {
> -		if (n & 0x01) {
> -			*(uint8_t *)dstu = *(const uint8_t *)srcu;
> -			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -		}
> -		if (n & 0x02) {
> -			*(uint16_t *)dstu = *(const uint16_t *)srcu;
> -			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -		}
> -		if (n & 0x04) {
> -			*(uint32_t *)dstu = *(const uint32_t *)srcu;
> -			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -		}
> -		if (n & 0x08) {
> -			*(uint64_t *)dstu = *(const uint64_t *)srcu;
> -		}
> -		return ret;
> +		return rte_mov15_or_less_unaligned(dst, src, n);
>  	}
>
>  	/**
> @@ -672,8 +710,6 @@ static __rte_always_inline void *
>  rte_memcpy_generic(void *dst, const void *src, size_t n)
>  {
>  	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> -	uintptr_t dstu = (uintptr_t)dst;
> -	uintptr_t srcu = (uintptr_t)src;
>  	void *ret = dst;
>  	size_t dstofss;
>  	size_t srcofs;
> @@ -682,25 +718,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 * Copy less than 16 bytes
>  	 */
>  	if (n < 16) {
> -		if (n & 0x01) {
> -			*(uint8_t *)dstu = *(const uint8_t *)srcu;
> -			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint8_t *)dstu + 1);
> -		}
> -		if (n & 0x02) {
> -			*(uint16_t *)dstu = *(const uint16_t *)srcu;
> -			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint16_t *)dstu + 1);
> -		}
> -		if (n & 0x04) {
> -			*(uint32_t *)dstu = *(const uint32_t *)srcu;
> -			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
> -			dstu = (uintptr_t)((uint32_t *)dstu + 1);
> -		}
> -		if (n & 0x08) {
> -			*(uint64_t *)dstu = *(const uint64_t *)srcu;
> -		}
> -		return ret;
> +		return rte_mov15_or_less_unaligned(dst, src, n);
>  	}
>
>  	/**
> --
> 2.25.1
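On the cover-note idea of eliminating the duplication with a macro: purely as a rough, hypothetical sketch of what that could look like (the macro name is made up; it only relies on __rte_always_inline and __rte_packed being available, as they already are for the helpers in the patch), the six load/store helpers could be generated from one definition:

/* Hypothetical generator for the unaligned load/store helpers.
 * RTE_DEFINE_UNALIGNED_ACCESS(64) expands to the same
 * rte_load_unaligned_uint64()/rte_store_unaligned_uint64() pair
 * that the patch spells out by hand.
 */
#define RTE_DEFINE_UNALIGNED_ACCESS(bits)				\
static __rte_always_inline uint##bits##_t				\
rte_load_unaligned_uint##bits(const void *ptr)				\
{									\
	struct unaligned_uint##bits { uint##bits##_t val; } __rte_packed; \
	return ((const struct unaligned_uint##bits *)ptr)->val;	\
}									\
									\
static __rte_always_inline void					\
rte_store_unaligned_uint##bits(void *ptr, uint##bits##_t val)		\
{									\
	struct unaligned_uint##bits { uint##bits##_t val; } __rte_packed; \
	((struct unaligned_uint##bits *)ptr)->val = val;		\
}

RTE_DEFINE_UNALIGNED_ACCESS(16)
RTE_DEFINE_UNALIGNED_ACCESS(32)
RTE_DEFINE_UNALIGNED_ACCESS(64)

Three invocations then produce the same six functions as the patch; whether that reads better than the explicit versions is exactly the kind of call the author is deferring to the maintainers.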