PING for review. The CI failures can be ignored: most of the CI doesn't support the Depends-on tag, so it builds this patch without Tyler's series [1], which provides the __rte_constant() macro used here.

[1]: https://inbox.dpdk.org/dev/1710970416-27841-1-git-send-email-roret...@linux.microsoft.com/
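For reviewers who don't want to dig through the diff: the core of the change is the pattern sketched below. This is only an illustrative sketch, not code from the patch; it uses __builtin_constant_p() directly, which the __rte_constant() macro from [1] is expected to wrap in a toolchain-neutral way.

/*
 * Illustrative sketch only (not part of the patch).
 * Build e.g. with: gcc -O2 -c sketch.c
 */
#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>	/* SSE2 intrinsics */

static inline void
sketch_mov16(uint8_t *dst, const uint8_t *src)
{
	_mm_storeu_si128((__m128i *)(void *)dst,
			_mm_loadu_si128((const __m128i *)(const void *)src));
}

/* Old behaviour of the 16..32 byte branch: for n == 16 the two calls
 * copy exactly the same 16 bytes.
 */
static inline void
sketch_copy_16_to_32_old(uint8_t *dst, const uint8_t *src, size_t n)
{
	sketch_mov16(dst, src);
	sketch_mov16(dst - 16 + n, src - 16 + n);	/* duplicate when n == 16 */
}

/* New behaviour: when n is known at build time to be 16, the second
 * (harmless but redundant) copy is dropped by the compiler; run-time
 * behaviour for non-constant n is unchanged.
 */
static inline void
sketch_copy_16_to_32_new(uint8_t *dst, const uint8_t *src, size_t n)
{
	sketch_mov16(dst, src);
	if (__builtin_constant_p(n) && n == 16)
		return;
	sketch_mov16(dst - 16 + n, src - 16 + n);
}

The diff applies the same guard in the other copy paths and adds similar shortcuts for build-time-constant sizes of 32 and 64 bytes.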
-Morten

> From: Morten Brørup [mailto:m...@smartsharesystems.com]
> Sent: Thursday, 30 May 2024 17.41
>
> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
>
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
>
> Suggested-by: Stephen Hemminger <step...@networkplumber.org>
> Signed-off-by: Morten Brørup <m...@smartsharesystems.com>
> Acked-by: Bruce Richardson <bruce.richard...@intel.com>
> ---
> Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p")
>
> v8:
> * Keep trying to fix that CI does not understand the dependency...
>   Depend on series instead of patch. Github only understands series.
> * Fix typo in patch description.
> v7:
> * Keep trying to fix that CI does not understand the dependency...
>   Depend on patch instead of series.
>   Move dependency out of the patch description itself, and down to the
>   version log.
> v6:
> * Trying to fix CI not understanding dependency...
>   Don't wrap dependency line.
> v5:
> * Fix for building with MSVC:
>   Use __rte_constant() instead of __builtin_constant_p().
>   Add dependency on patch providing __rte_constant().
> v4:
> * There are no problems compiling AVX2, only AVX. (Bruce Richardson)
> v3:
> * AVX2 is a superset of AVX;
>   for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
> * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
>   check for older GCC version. (Bruce Richardson)
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)
> ---
>  lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
>  1 file changed, 64 insertions(+), 175 deletions(-)
>
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..1619a8f296 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -27,6 +27,16 @@ extern "C" {
>  #pragma GCC diagnostic ignored "-Wstringop-overflow"
>  #endif
>
> +/*
> + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
> + * There are no problems with AVX2.
> + */
> +#if defined __AVX2__
> +#define RTE_MEMCPY_AVX
> +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
> +#define RTE_MEMCPY_AVX
> +#endif
> +
>  /**
>   * Copy bytes from one location to another. The locations must not overlap.
>   *
> @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>  	return ret;
>  }
>
> -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> -
> -#define ALIGNMENT_MASK 0x3F
> -
> -/**
> - * AVX512 implementation below
> - */
> -
>  /**
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
> @@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined RTE_MEMCPY_AVX
>  	__m256i ymm0;
>
>  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
>  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +#endif
>  }
>
>  /**
> @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>  	__m512i zmm0;
>
>  	zmm0 = _mm512_loadu_si512((const void *)src);
>  	_mm512_storeu_si512((void *)dst, zmm0);
> +#else /* AVX2, AVX & SSE implementation */
> +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +#endif
>  }
>
>  /**
> @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> +	rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
>
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> +
> +/**
> + * AVX512 implementation below
> + */
> +
> +#define ALIGNMENT_MASK 0x3F
> +
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
> @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> +	if (__rte_constant(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
> +	if (__rte_constant(n) && n == 64) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		rte_mov32((uint8_t *)dst - 32 + n,
> @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  		goto COPY_BLOCK_128_BACK63;
>  	}
>
> -#elif defined __AVX2__
> -
> -#define ALIGNMENT_MASK 0x1F
> -
> -/**
> - * AVX2 implementation below
> - */
> -
> -/**
> - * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> -{
> -	__m128i xmm0;
> -
> -	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
> -	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
> -}
> -
> -/**
> - * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> -{
> -	__m256i ymm0;
> -
> -	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
> -	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
> -}
> +#elif defined RTE_MEMCPY_AVX
>
>  /**
> - * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -}
> -
> -/**
> - * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> + * AVX implementation below
>   */
> -static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -}
> -
> -/**
> - * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
> -	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
> -	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
> -	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
> -}
> +#define ALIGNMENT_MASK 0x1F
>
>  /**
>   * Copy 128-byte blocks from one location to another,
> @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> +	if (__rte_constant(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		return ret;
>  	}
> -	if (n <= 48) {
> +	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> +		if (__rte_constant(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
> @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>
>  #else /* __AVX512F__ */
>
> -#define ALIGNMENT_MASK 0x0F
> -
> -/**
> - * SSE & AVX implementation below
> - */
> -
> -/**
> - * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> -{
> -	__m128i xmm0;
> -
> -	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
> -	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
> -}
> -
> -/**
> - * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -}
> -
>  /**
> - * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> + * SSE implementation below
>   */
> -static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -}
>
> -/**
> - * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -}
> -
> -/**
> - * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> -}
> +#define ALIGNMENT_MASK 0x0F
>
>  /**
>   * Macro for copying unaligned block from one location to another with constant load offset,
> @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 */
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> -		return ret;
> -	}
> -	if (n <= 48) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +		if (n > 48)
> +			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
>  		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
> @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  	}
>
>  	/* Copy 16 <= size <= 32 bytes */
> +	if (__rte_constant(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>
> @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  	}
>
>  	/* Copy 32 < size <= 64 bytes */
> +	if (__rte_constant(n) && n == 64) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		rte_mov32((uint8_t *)dst - 32 + n,
> --
> 2.17.1
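PS: For anyone reviewing without the dependency series applied: the point of __rte_constant() is to keep MSVC, which has no __builtin_constant_p(), building. I expect the definition in [1] to behave roughly like the paraphrase below; see the series itself for the authoritative version.

/* Paraphrase of the expected semantics -- not the actual code from [1]. */
#ifdef RTE_TOOLCHAIN_MSVC
#define __rte_constant(e) 0	/* never treat 'e' as a build-time constant */
#else
#define __rte_constant(e) __builtin_constant_p(e)
#endif

With a definition along those lines, the new "if (__rte_constant(n) && n == 16)" branches simply compile away on MSVC, and the behaviour there stays exactly as before this patch.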