-----Original Message-----
> Date: Mon, 18 Dec 2017 10:54:24 +0800
> From: Herbert Guan <herbert.g...@arm.com>
> To: dev@dpdk.org, jerin.ja...@caviumnetworks.com
> CC: Herbert Guan <herbert.g...@arm.com>
> Subject: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> X-Mailer: git-send-email 1.8.3.1
>
> Signed-off-by: Herbert Guan <herbert.g...@arm.com>
> ---
>  config/common_armv8a_linuxapp                      |   6 +
>  .../common/include/arch/arm/rte_memcpy_64.h        | 292 +++++++++++++++++++++
>  2 files changed, 298 insertions(+)
>
> diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
> index 6732d1e..8f0cbed 100644
> --- a/config/common_armv8a_linuxapp
> +++ b/config/common_armv8a_linuxapp
> @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
>  # to address minimum DMA alignment across all arm64 implementations.
>  CONFIG_RTE_CACHE_LINE_SIZE=128
>
> +# Accelarate rte_memcpy.  Be sure to run unit test to determine the
There is an additional space before "Be". Also, rather than just mentioning
the unit test, please mention the exact test case name (memcpy_perf_autotest).

> +# best threshold in code.  Refer to notes in source file

Additional space before "Refer" as well.

> +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> +# info.
> +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> +
>  CONFIG_RTE_LIBRTE_FM10K_PMD=n
>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>  CONFIG_RTE_LIBRTE_AVP_PMD=n
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> index b80d8ba..1ea275d 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> @@ -42,6 +42,296 @@
>
>  #include "generic/rte_memcpy.h"
>
> +#ifdef RTE_ARCH_ARM64_MEMCPY

See the comment below at the "(GCC_VERSION < 50400)" check.

> +#include <rte_common.h>
> +#include <rte_branch_prediction.h>
> +
> +/*
> + * The memory copy performance differs on different AArch64 micro-architectures.
> + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
> + * performance compared to old glibc versions. It's always suggested to use a
> + * more recent glibc if possible, from which the entire system can get benefit.
> + *
> + * This implementation improves memory copy on some aarch64 micro-architectures,
> + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
> + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
> + * always providing better performance than memcpy() so users need to run unit
> + * test "memcpy_perf_autotest" and customize parameters in customization section
> + * below for best performance.
> + *
> + * Compiler version will also impact the rte_memcpy() performance. It's observed
> + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
> + * provide better performance than GCC 4.8.5 compiled binaries.
> + */
> +
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define ALIGNMENT_MASK 0x0F

This symbol will be included in the public rte_memcpy.h for arm64 DPDK builds.
Please use an RTE_ prefix to avoid multiple definitions
(RTE_ARCH_ARM64_ALIGN_MASK, or any shorter name).

> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +/* Only src unalignment will be treaed as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> +#else
> +/* Both dst and src unalignment will be treated as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) \
> +        (((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> +#endif
> +
> +
> +/*
> + * If copy size is larger than threshold, memcpy() will be used.
> + * Run "memcpy_perf_autotest" to determine the proper threshold.
> + */
> +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))

Same as the above comment (please add the RTE_ prefix here too).

> +
> +/**************************************
> + * End of customization section
> + **************************************/
> +#ifdef RTE_TOOLCHAIN_GCC
> +#if (GCC_VERSION < 50400)
> +#warning "The GCC version is quite old, which may result in sub-optimal \
> +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> +be used."

Even though it is only a warning, it will generate an error depending on
where this file gets included (see below):

  CC eal_common_options.o
In file included from /home/jerin/dpdk.org/build/include/rte_memcpy.h:37:0,
                 from /home/jerin/dpdk.org/lib/librte_eal/common/eal_common_options.c:53:
/home/jerin/dpdk.org/build/include/rte_memcpy_64.h:93:2: error: #warning
                                                         ^^^^^^^^
"The GCC version is quite old, which may result in sub-optimal
performance of the compiled code. It is suggested that at least GCC
5.4.0 be used." [-Werror=cpp]
                ^^^^^^^^^^^^^^
 #warning "The GCC version is quite old, which may result in sub-optimal \
 ^

How about selecting the optimized memcpy only when RTE_ARCH_ARM64_MEMCPY is
defined and (GCC_VERSION >= 50400)?
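Something along these lines would avoid the #warning entirely. This is only an
untested sketch; the #else branch simply stands for whatever the existing
non-optimized path in rte_memcpy_64.h already provides:

#if defined(RTE_ARCH_ARM64_MEMCPY) && \
        (!defined(RTE_TOOLCHAIN_GCC) || (GCC_VERSION >= 50400))

/* ... optimized rte_memcpy() and its helpers from this patch ... */

#else

/* ... keep the existing generic path (plain memcpy()) for old compilers ... */

#endif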
> +#endif
> +#endif
> +
> +
> +#if RTE_CACHE_LINE_SIZE >= 128

We can remove this conditional compilation check, i.e. it can be compiled for
both cases, but it will be used only when RTE_CACHE_LINE_SIZE >= 128.

> +static __rte_always_inline void
> +rte_memcpy_ge16_lt128
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +        if (n < 64) {
> +                if (n == 16) {
> +                        rte_mov16(dst, src);
> +                } else if (n <= 32) {
> +                        rte_mov16(dst, src);
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +                } else if (n <= 48) {
> +                        rte_mov32(dst, src);
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +                } else {
> +                        rte_mov48(dst, src);
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +                }
> +        } else {
> +                rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +                if (n > 48 + 64)
> +                        rte_mov64(dst - 64 + n, src - 64 + n);
> +                else if (n > 32 + 64)
> +                        rte_mov48(dst - 48 + n, src - 48 + n);
> +                else if (n > 16 + 64)
> +                        rte_mov32(dst - 32 + n, src - 32 + n);
> +                else if (n > 64)
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +        }
> +}
> +
> +
> +#else

Same as the above comment.

> +static __rte_always_inline void
> +rte_memcpy_ge16_lt64
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +        if (n == 16) {
> +                rte_mov16(dst, src);
> +        } else if (n <= 32) {
> +                rte_mov16(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
> +        } else if (n <= 48) {
> +                rte_mov32(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
> +        } else {
> +                rte_mov48(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
> +        }
> +}
> +
> +
> +static __rte_always_inline void *
> +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> +{
> +        if (n < 16) {
> +                rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> +                return dst;
> +        }
> +#if RTE_CACHE_LINE_SIZE >= 128
> +        if (n < 128) {
> +                rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
> +                return dst;
> +        }
> +#else
> +        if (n < 64) {
> +                rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
> +                return dst;
> +        }
> +#endif
> +        __builtin_prefetch(src, 0, 0);
> +        __builtin_prefetch(dst, 1, 0);
> +        if (likely(
> +                (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
> +                || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
> +                )) {
> +#if RTE_CACHE_LINE_SIZE >= 128
> +                rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
> +#else
> +                rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
> +#endif

Can we remove this #ifdef clutter (we have two of them in the same function)?
I suggest removing it by having separate routines, i.e.:

1) Select between two variants of rte_memcpy() at function level:

#if RTE_CACHE_LINE_SIZE >= 128
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
}
#else
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
}
#endif

2) Have a separate inline function for the following logic and use it in both
variants (a fuller sketch follows at the end of this mail):

if (likely(
        (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
        || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
        )) {

With the above changes:

Acked-by: Jerin Jacob <jerin.ja...@caviumnetworks.com>
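P.S. For illustration, below is a rough, untested sketch of the restructuring
suggested above. The helper name copy_threshold_ok() is only a placeholder
(any name will do), it reuses the macros and copy helpers already defined in
this patch, and the else branch falls back to plain memcpy() as the patch's
threshold comment describes.

static __rte_always_inline int
copy_threshold_ok(const void *dst, const void *src, size_t n)
{
        /* Shared alignment/threshold decision used by both variants below. */
        return (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD) ||
                (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD);
}

#if RTE_CACHE_LINE_SIZE >= 128

static __rte_always_inline void *
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
        if (n < 16) {
                rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
                return dst;
        }
        if (n < 128) {
                rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
                return dst;
        }
        __builtin_prefetch(src, 0, 0);
        __builtin_prefetch(dst, 1, 0);
        if (likely(copy_threshold_ok(dst, src, n)))
                rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
        else
                memcpy(dst, src, n);
        return dst;
}

#else

static __rte_always_inline void *
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
        if (n < 16) {
                rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
                return dst;
        }
        if (n < 64) {
                rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
                return dst;
        }
        __builtin_prefetch(src, 0, 0);
        __builtin_prefetch(dst, 1, 0);
        if (likely(copy_threshold_ok(dst, src, n)))
                rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
        else
                memcpy(dst, src, n);
        return dst;
}

#endif /* RTE_CACHE_LINE_SIZE >= 128 */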