On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <step...@networkplumber.org>
> Signed-off-by: Morten Brørup <m...@smartsharesystems.com>

Changes in general look good to me. Comments inline below.

/Bruce

> ---
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)
> ---
>  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
>  1 file changed, 56 insertions(+), 175 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..d1df841f5e 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>       return ret;
>  }
>  
> -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> -
> -#define ALIGNMENT_MASK 0x3F
> -
> -/**
> - * AVX512 implementation below
> - */
> -
>  /**
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
> @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || \
> +             (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))

I think we can drop the AVX512 checks here, since I'm not aware of any
system where we'd have AVX512 but not AVX2 available, so just checking for
AVX2 support should be sufficient.
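
Just to illustrate what I mean (untested), the guard could then reduce to:

#if defined __AVX2__ || \
		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))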

On the final compiler-based check, I don't strongly object to it, but I
just wonder about its real value. AVX2 was first introduced by Intel over 10
years ago, and (from what I find on Wikipedia) it's been in AMD CPUs since
~2015. While CPUs without AVX2 were still being produced after that time,
they generally didn't have AVX1 either, only SSE instructions. Therefore the
number of systems which require this additional check is likely very small
at this stage.
That said, I'm OK with either keeping or omitting it, at your choice. If you
do keep it, how about putting the check once at the top of the file and
using a single short define for the multiple places it's used, e.g.

#if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))
#define RTE_MEMCPY_AVX2
#endif
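
and then the individual functions could just test that define (together with
the existing __AVX2__ check), e.g. for rte_mov32() - purely illustrative:

static __rte_always_inline void
rte_mov32(uint8_t *dst, const uint8_t *src)
{
#if defined __AVX2__ || defined RTE_MEMCPY_AVX2
	__m256i ymm0;

	ymm0 = _mm256_loadu_si256((const __m256i *)src);
	_mm256_storeu_si256((__m256i *)dst, ymm0);
#else /* SSE implementation */
	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
#endif
}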


>       __m256i ymm0;
>  
>       ymm0 = _mm256_loadu_si256((const __m256i *)src);
>       _mm256_storeu_si256((__m256i *)dst, ymm0);
> +#else /* SSE implementation */
> +     rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +     rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +#endif
>  }
>  
>  /**
> @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>       __m512i zmm0;
>  
>       zmm0 = _mm512_loadu_si512((const void *)src);
>       _mm512_storeu_si512((void *)dst, zmm0);
> +#else /* AVX2, AVX & SSE implementation */
> +     rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +     rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +#endif
>  }
>  
>  /**
> @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -     rte_mov64(dst + 0 * 64, src + 0 * 64);
> -     rte_mov64(dst + 1 * 64, src + 1 * 64);
> -     rte_mov64(dst + 2 * 64, src + 2 * 64);
> -     rte_mov64(dst + 3 * 64, src + 3 * 64);
> +     rte_mov128(dst + 0 * 128, src + 0 * 128);
> +     rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
>  
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> +
> +/**
> + * AVX512 implementation below
> + */
> +
> +#define ALIGNMENT_MASK 0x3F
> +
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
> @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>       /**
>        * Fast way when copy size doesn't exceed 512 bytes
>        */
> +     if (__builtin_constant_p(n) && n == 32) {
> +             rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +             return ret;
> +     }

There's an outstanding patchset from Stephen to replace all uses of
rte_memcpy with a constant parameter with an actual call to regular memcpy.
On a wider scale, should we not look to do something similar in this file,
i.e. have calls to rte_memcpy with a constant parameter always turn into a
call to regular memcpy? We used to have such a macro in older DPDK, e.g.
in DPDK 1.8:

http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?h=v1.8.0#n171

This would eliminate the need to put __builtin_constant_p checks all through
the code.
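
Roughly, the idea would be something like the below (a sketch from memory of
the old macro, not the exact code; rte_memcpy_func here stands for whatever
name the runtime implementation ends up with):

/* Sketch only: let the compiler expand constant-size copies via regular
 * memcpy, and call the runtime implementation otherwise. */
#define rte_memcpy(dst, src, n)               \
	((__builtin_constant_p(n)) ?          \
	memcpy((dst), (src), (n)) :           \
	rte_memcpy_func((dst), (src), (n)))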

>       if (n <= 32) {
>               rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +             if (__builtin_constant_p(n) && n == 16)
> +                     return ret; /* avoid (harmless) duplicate copy */
>               rte_mov16((uint8_t *)dst - 16 + n,
>                                 (const uint8_t *)src - 16 + n);
>               return ret;
>       }
<snip>
