-----Original Message-----
> Date: Mon, 18 Dec 2017 10:54:24 +0800
> From: Herbert Guan <herbert.g...@arm.com>
> To: dev@dpdk.org, jerin.ja...@caviumnetworks.com
> CC: Herbert Guan <herbert.g...@arm.com>
> Subject: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> X-Mailer: git-send-email 1.8.3.1
>
> Signed-off-by: Herbert Guan <herbert.g...@arm.com>
> ---
>  config/common_armv8a_linuxapp                      |   6 +
>  .../common/include/arch/arm/rte_memcpy_64.h        | 292 +++++++++++++++++++++
>  2 files changed, 298 insertions(+)
>
> diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
> index 6732d1e..8f0cbed 100644
> --- a/config/common_armv8a_linuxapp
> +++ b/config/common_armv8a_linuxapp
> @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
>  # to address minimum DMA alignment across all arm64 implementations.
>  CONFIG_RTE_CACHE_LINE_SIZE=128
>
> +# Accelarate rte_memcpy.  Be sure to run unit test to determine the
There is an additional space before "Be". Also, rather than just mentioning
the unit test, please mention the exact test case name (memcpy_perf_autotest).

> +# best threshold in code.  Refer to notes in source file

Additional space before "Refer" as well.

> +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> +# info.
> +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> +
>  CONFIG_RTE_LIBRTE_FM10K_PMD=n
>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>  CONFIG_RTE_LIBRTE_AVP_PMD=n
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> index b80d8ba..1ea275d 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> @@ -42,6 +42,296 @@
>
>  #include "generic/rte_memcpy.h"
>
> +#ifdef RTE_ARCH_ARM64_MEMCPY

See the comment below at the "(GCC_VERSION < 50400)" check.

> +#include <rte_common.h>
> +#include <rte_branch_prediction.h>
> +
> +/*
> + * The memory copy performance differs on different AArch64 micro-architectures.
> + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
> + * performance compared to old glibc versions. It's always suggested to use a
> + * more recent glibc if possible, from which the entire system can get benefit.
> + *
> + * This implementation improves memory copy on some aarch64 micro-architectures,
> + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
> + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
> + * always providing better performance than memcpy() so users need to run unit
> + * test "memcpy_perf_autotest" and customize parameters in customization section
> + * below for best performance.
> + *
> + * Compiler version will also impact the rte_memcpy() performance. It's observed
> + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
> + * provide better performance than GCC 4.8.5 compiled binaries.
> + */
> +
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define ALIGNMENT_MASK 0x0F

This symbol will be included in the public rte_memcpy.h for arm64 DPDK builds.
Please use an RTE_ prefix to avoid multiple definitions
(RTE_ARCH_ARM64_ALIGN_MASK, or any shorter name).

> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +/* Only src unalignment will be treaed as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> +#else
> +/* Both dst and src unalignment will be treated as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) \
> +        (((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> +#endif
> +
> +
> +/*
> + * If copy size is larger than threshold, memcpy() will be used.
> + * Run "memcpy_perf_autotest" to determine the proper threshold.
> + */
> +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))

Same as the above comment (please add the RTE_ prefix here too).

> +
> +/**************************************
> + * End of customization section
> + **************************************/
> +#ifdef RTE_TOOLCHAIN_GCC
> +#if (GCC_VERSION < 50400)
> +#warning "The GCC version is quite old, which may result in sub-optimal \
> +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> +be used."

Even though it is only a warning, it will generate an error depending on
where this file gets included (see below):

  CC eal_common_options.o
In file included from /home/jerin/dpdk.org/build/include/rte_memcpy.h:37:0,
                 from /home/jerin/dpdk.org/lib/librte_eal/common/eal_common_options.c:53:
/home/jerin/dpdk.org/build/include/rte_memcpy_64.h:93:2: error: #warning
                                                         ^^^^^^^^
"The GCC version is quite old, which may result in sub-optimal
performance of the compiled code. It is suggested that at least GCC
5.4.0 be used." [-Werror=cpp]
                ^^^^^^^^^^^^^^
 #warning "The GCC version is quite old, which may result in sub-optimal \
 ^

How about selecting the optimized memcpy only when RTE_ARCH_ARM64_MEMCPY is
defined and (GCC_VERSION >= 50400)?
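Something along these lines would avoid the #warning entirely. This is only an
untested sketch; the #else branch simply stands for whatever the existing
non-optimized path in rte_memcpy_64.h already provides:

#if defined(RTE_ARCH_ARM64_MEMCPY) && \
        (!defined(RTE_TOOLCHAIN_GCC) || (GCC_VERSION >= 50400))

/* ... optimized rte_memcpy() and its helpers from this patch ... */

#else

/* ... keep the existing generic path (plain memcpy()) for old compilers ... */

#endif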
> +#endif
> +#endif
> +
> +
> +#if RTE_CACHE_LINE_SIZE >= 128

We can remove this conditional compilation check, i.e. it can be compiled for
both cases, but it will be used only when RTE_CACHE_LINE_SIZE >= 128.

> +static __rte_always_inline void
> +rte_memcpy_ge16_lt128
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +        if (n < 64) {
> +                if (n == 16) {
> +                        rte_mov16(dst, src);
> +                } else if (n <= 32) {
> +                        rte_mov16(dst, src);
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +                } else if (n <= 48) {
> +                        rte_mov32(dst, src);
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +                } else {
> +                        rte_mov48(dst, src);
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +                }
> +        } else {
> +                rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +                if (n > 48 + 64)
> +                        rte_mov64(dst - 64 + n, src - 64 + n);
> +                else if (n > 32 + 64)
> +                        rte_mov48(dst - 48 + n, src - 48 + n);
> +                else if (n > 16 + 64)
> +                        rte_mov32(dst - 32 + n, src - 32 + n);
> +                else if (n > 64)
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +        }
> +}
> +
> +
> +#else

Same as the above comment.

> +static __rte_always_inline void
> +rte_memcpy_ge16_lt64
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +        if (n == 16) {
> +                rte_mov16(dst, src);
> +        } else if (n <= 32) {
> +                rte_mov16(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
> +        } else if (n <= 48) {
> +                rte_mov32(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
> +        } else {
> +                rte_mov48(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
> +        }
> +}
> +
> +
> +static __rte_always_inline void *
> +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> +{
> +        if (n < 16) {
> +                rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> +                return dst;
> +        }
> +#if RTE_CACHE_LINE_SIZE >= 128
> +        if (n < 128) {
> +                rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
> +                return dst;
> +        }
> +#else
> +        if (n < 64) {
> +                rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
> +                return dst;
> +        }
> +#endif
> +        __builtin_prefetch(src, 0, 0);
> +        __builtin_prefetch(dst, 1, 0);
> +        if (likely(
> +                (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
> +                || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
> +                )) {
> +#if RTE_CACHE_LINE_SIZE >= 128
> +                rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
> +#else
> +                rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
> +#endif

Can we remove this #ifdef clutter (we have two of them in the same function)?
I suggest removing it by having separate routines, i.e.:

1) Select between two variants of rte_memcpy() at function level:

#if RTE_CACHE_LINE_SIZE >= 128
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
}
#else
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
}
#endif

2) Have a separate inline function for the following logic and use it in both
variants (a fuller sketch follows at the end of this mail):

if (likely(
        (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
        || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
        )) {

With the above changes:

Acked-by: Jerin Jacob <jerin.ja...@caviumnetworks.com>
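P.S. For illustration, below is a rough, untested sketch of the restructuring
suggested above. The helper name copy_threshold_ok() is only a placeholder
(any name will do), it reuses the macros and copy helpers already defined in
this patch, and the else branch falls back to plain memcpy() as the patch's
threshold comment describes.

static __rte_always_inline int
copy_threshold_ok(const void *dst, const void *src, size_t n)
{
        /* Shared alignment/threshold decision used by both variants below. */
        return (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD) ||
                (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD);
}

#if RTE_CACHE_LINE_SIZE >= 128

static __rte_always_inline void *
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
        if (n < 16) {
                rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
                return dst;
        }
        if (n < 128) {
                rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
                return dst;
        }
        __builtin_prefetch(src, 0, 0);
        __builtin_prefetch(dst, 1, 0);
        if (likely(copy_threshold_ok(dst, src, n)))
                rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
        else
                memcpy(dst, src, n);
        return dst;
}

#else

static __rte_always_inline void *
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
        if (n < 16) {
                rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
                return dst;
        }
        if (n < 64) {
                rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
                return dst;
        }
        __builtin_prefetch(src, 0, 0);
        __builtin_prefetch(dst, 1, 0);
        if (likely(copy_threshold_ok(dst, src, n)))
                rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
        else
                memcpy(dst, src, n);
        return dst;
}

#endif /* RTE_CACHE_LINE_SIZE >= 128 */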