On Thu, Jan 22, 2015 at 4:58 AM, Steve Capper <steve.cap...@linaro.org> wrote:
> Hi Yazen,
>
> I have a few comments below.
>
> Cheers,
> --
> Steve
>
>
> On Wed, Jan 21, 2015 at 02:17:47PM -0600, Yazen Ghannam wrote:
>> Tested on AMD Seattle.
>> Passes all crc32c unit tests.
>> ~4x performance increase versus sctp.
>
> Was that perf result from the unit test?
>
Yes, this was from the unit test.

> Also describe the reason for the patch in the commit log; we have an
> optional CRC instruction that can be used instead of the Ceph in-built
> table lookup. This patch uses the CRC instruction where available...
>
> Unit tests are added and they show a ...
>
> Probably also mention that this based off Ed Nevill's Hadoop patch.
>
Will do.

>>
>> Signed-off-by: Yazen Ghannam <yazen.ghan...@linaro.org>
>> ---
>>  configure.ac                   |  1 +
>>  m4/ax_arm.m4                   | 18 +++++++++++++---
>>  src/arch/arm.c                 |  2 ++
>>  src/arch/arm.h                 |  1 +
>>  src/common/Makefile.am         | 11 ++++++++--
>>  src/common/crc32c.cc           |  6 ++++++
>>  src/common/crc32c_arm64.c      | 48 
>> ++++++++++++++++++++++++++++++++++++++++++
>>  src/common/crc32c_arm64.h      | 25 ++++++++++++++++++++++
>>  src/test/common/test_crc32c.cc |  9 ++++++++
>>  9 files changed, 116 insertions(+), 5 deletions(-)
>>  create mode 100644 src/common/crc32c_arm64.c
>>  create mode 100644 src/common/crc32c_arm64.h
>>
>> diff --git a/configure.ac b/configure.ac
>> index d836b02..60e4feb 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -575,6 +575,7 @@ AC_LANG_POP([C++])
>>  # Find supported SIMD / NEON / SSE extensions supported by the compiler
>>  AX_ARM_FEATURES()
>>  AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"])
>> +AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"])
>>  AX_INTEL_FEATURES()
>>  AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"])
>>  AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = 
>> "xyes"])
>> diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4
>> index 2ccc9a9..4ef8081 100644
>> --- a/m4/ax_arm.m4
>> +++ b/m4/ax_arm.m4
>> @@ -13,13 +13,25 @@ AC_DEFUN([AX_ARM_FEATURES],
>>        fi
>>      ;;
>>      aarch64*)
>> +      AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, [])
>> +      if test x"$ax_cv_support_armv8" = x"yes"; then
>> +        ARM_ARCH_FLAGS="-march=armv8-a"
>> +        ARM_DEFINE_FLAGS="-DARCH_AARCH64"
>> +      fi
>>        AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, 
>> ax_cv_support_neon_ext=yes, [])
>>        if test x"$ax_cv_support_neon_ext" = x"yes"; then
>> -        ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON"
>> -        AC_SUBST(ARM_NEON_FLAGS)
>> -        ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
>> +        ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd"
>> +        ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON"
>>          AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
>>        fi
>> +      AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, 
>> [])
>> +      if test x"$ax_cv_support_crc_ext" = x"yes"; then
>> +        ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc"
>> +        AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions])
>> +      fi
>> +        ARM_NEON_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS"
>> +        AC_SUBST(ARM_NEON_FLAGS)
>> +        ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS"
>
> So if the compiler supports CRC, then ARM_FLAGS will be set with CRC
> compile options. This will also affect the jerasure codegen (which
> doesn't check for the CRC HWCAP). Although this is unlikely to cause
> problems, I would feel more comfortable with something like:
> ARM_CRC_FLAGS
> That way you disambiguate between NEON (for jerasure) and CRC.
>
It looks to me that jerasure only uses ARM_NEON_FLAGS. What I'll do is
have minimal flags for each feature (e.g. NEON and CRC) and put all
the available flags in ARM_FLAGS.

>>      ;;
>>    esac
>>
>> diff --git a/src/arch/arm.c b/src/arch/arm.c
>> index 93d079a..2ea97eb 100644
>> --- a/src/arch/arm.c
>> +++ b/src/arch/arm.c
>> @@ -2,6 +2,7 @@
>>
>>  /* flags we export */
>>  int ceph_arch_neon = 0;
>> +int ceph_arch_arm64_crc32 = 0;
>
> <bikeshedding-comment>
> Can we rename this variable: ceph_arch_aarch64_crc32 ?
> </bikeshedding-comment>
>
Will do.

>>
>>  #include <stdio.h>
>>
>> @@ -47,6 +48,7 @@ int ceph_arch_arm_probe(void)
>>       ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON;
>>  #elif __aarch64__ && __linux__
>>       ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD;
>> +     ceph_arch_arm64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32;
>>  #else
>>       if (0)
>>               get_hwcap();  // make compiler shut up
>> diff --git a/src/arch/arm.h b/src/arch/arm.h
>> index f613438..4ac716a 100644
>> --- a/src/arch/arm.h
>> +++ b/src/arch/arm.h
>> @@ -6,6 +6,7 @@ extern "C" {
>>  #endif
>>
>>  extern int ceph_arch_neon;  /* true if we have ARM NEON or ASIMD abilities 
>> */
>> +extern int ceph_arch_arm64_crc32;  /* true if we have Aarch64 CRC32/CRC32C 
>> abilities */
>
> "AArch64", we have two capital A's.
>
Ack.

>>
>>  extern int ceph_arch_arm_probe(void);
>>
>> diff --git a/src/common/Makefile.am b/src/common/Makefile.am
>> index 2888194..4c36216 100644
>> --- a/src/common/Makefile.am
>> +++ b/src/common/Makefile.am
>> @@ -103,12 +103,18 @@ libcommon_crc_la_SOURCES = \
>>       common/sctp_crc32.c \
>>       common/crc32c.cc \
>>       common/crc32c_intel_baseline.c \
>> -     common/crc32c_intel_fast.c
>> +     common/crc32c_intel_fast.c \
>> +     common/crc32c_arm64.c
>
> I would recommend crc32c_aarch64.c here too.
>
Ack.

>>
>>  if WITH_GOOD_YASM_ELF64
>>  libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S 
>> common/crc32c_intel_fast_zero_asm.S
>>  libcommon_crc_la_LIBTOOLFLAGS = --tag=CC
>>  endif
>> +
>> +if HAVE_ARMV8_CRC
>> +libcommon_crc_la_CFLAGS = $(AM_CFLAGS) $(ARM_FLAGS)
>> +endif
>> +
>
> The problem with this is that all the files in the
> libcommon_crc_la_SOURCES will be compiled with this flag. We want to
> limit the scope to just the ARM CRC file.
>
I don't understand the issue here. Can you please explain? There is a
check for HAVE_ARMV8_CRC which is only set if these flags are valid.
This seems to work fine on aarch64 and x86.

>>  LIBCOMMON_DEPS += libcommon_crc.la
>>  noinst_LTLIBRARIES += libcommon_crc.la
>>
>> @@ -116,7 +122,8 @@ noinst_HEADERS += \
>>       common/bloom_filter.hpp \
>>       common/sctp_crc32.h \
>>       common/crc32c_intel_baseline.h \
>> -     common/crc32c_intel_fast.h
>> +     common/crc32c_intel_fast.h \
>> +     common/crc32c_arm64.h
>>
>>
>>  # important; libmsg before libauth!
>> diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc
>> index e2e81a4..540d431 100644
>> --- a/src/common/crc32c.cc
>> +++ b/src/common/crc32c.cc
>> @@ -5,9 +5,11 @@
>>
>>  #include "arch/probe.h"
>>  #include "arch/intel.h"
>> +#include "arch/arm.h"
>>  #include "common/sctp_crc32.h"
>>  #include "common/crc32c_intel_baseline.h"
>>  #include "common/crc32c_intel_fast.h"
>> +#include "common/crc32c_arm64.h"
>
> Does pulling these ARM header files in affect how this compiles for
> other architectures? Could you please give this a quick build on
> x86?
>
Builds fine on x86. I followed the same format as the intel files
where arch-specific stuff is #ifdef'd out. See next comment.

>>
>>  /*
>>   * choose best implementation based on the CPU architecture.
>> @@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void)
>>      return ceph_crc32c_intel_fast;
>>    }
>>
>> +  if (ceph_arch_arm64_crc32){
>> +    return ceph_crc32c_arm64;
>> +  }
>> +
>>    // default
>>    return ceph_crc32c_sctp;
>>  }
>> diff --git a/src/common/crc32c_arm64.c b/src/common/crc32c_arm64.c
>> new file mode 100644
>> index 0000000..6a754fb
>> --- /dev/null
>> +++ b/src/common/crc32c_arm64.c
>> @@ -0,0 +1,48 @@
>> +#include "acconfig.h"
>> +#include "include/int_types.h"
>> +#include "common/crc32c_arm64.h"
>> +
>> +#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], 
>> %x[v]":[c]"+r"(crc):[v]"r"(value))
>> +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], 
>> %w[v]":[c]"+r"(crc):[v]"r"(value))
>> +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], 
>> %w[v]":[c]"+r"(crc):[v]"r"(value))
>> +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], 
>> %w[v]":[c]"+r"(crc):[v]"r"(value))
>> +
>
> This won't likely build for other architectures.
> I would recommend something like:
>
> #ifdef __aarch64__
> CRC routines
> #else
> empty functions
> #endif /* __arch64__ */
>
Yes, you are correct. I will add a check here. The empty function is
defined in the header file.

>> +uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, 
>> unsigned len)
>> +{
>> +     int64_t length = len;
>> +
>> +     if (!buffer) {
>> +
>> +             while ((length -= sizeof(uint64_t)) >= 0)
>> +                     CRC32CX(crc, 0);
>> +
>> +             /* The following is more efficient than the straight loop */
>> +             if (length & sizeof(uint32_t))
>> +                     CRC32CW(crc, 0);
>> +
>> +             if (length & sizeof(uint16_t))
>> +                     CRC32CH(crc, 0);
>> +
>> +             if (length & sizeof(uint8_t))
>> +                     CRC32CB(crc, 0);
>> +     } else {
>> +             while ((length -= sizeof(uint64_t)) >= 0) {
>> +                     CRC32CX(crc, *(uint64_t *)buffer);
>> +                     buffer += sizeof(uint64_t);
>> +             }
>> +
>> +             /* The following is more efficient than the straight loop */
>> +             if (length & sizeof(uint32_t)) {
>> +                     CRC32CW(crc, *(uint32_t *)buffer);
>> +                     buffer += sizeof(uint32_t);
>> +             }
>> +             if (length & sizeof(uint16_t)) {
>> +                     CRC32CH(crc, *(uint16_t *)buffer);
>> +                     buffer += sizeof(uint16_t);
>> +             }
>> +             if (length & sizeof(uint8_t))
>> +                     CRC32CB(crc, *buffer);
>> +     }
>> +     return crc;
>> +}
>> +
>> diff --git a/src/common/crc32c_arm64.h b/src/common/crc32c_arm64.h
>> new file mode 100644
>> index 0000000..100b5e4
>> --- /dev/null
>> +++ b/src/common/crc32c_arm64.h
>> @@ -0,0 +1,25 @@
>> +#ifndef CEPH_COMMON_CRC32C_ARM64_H
>> +#define CEPH_COMMON_CRC32C_ARM64_H
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +#ifdef __aarch64__
>> +
>> +extern uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const 
>> *buffer, unsigned len);
>> +
>> +#else
>> +
>> +static inline uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const 
>> *buffer, unsigned len)
>> +{
>> +     return 0;
>> +}
>> +
>> +#endif
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif
>> diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc
>> index b4297c6..1a9b2e2 100644
>> --- a/src/test/common/test_crc32c.cc
>> +++ b/src/test/common/test_crc32c.cc
>> @@ -13,6 +13,7 @@
>>
>>  #include "common/sctp_crc32.h"
>>  #include "common/crc32c_intel_baseline.h"
>> +#include "common/crc32c_arm64.h"
>>
>>  TEST(Crc32c, Small) {
>>    const char *a = "foo bar baz";
>> @@ -80,6 +81,14 @@ TEST(Crc32c, Performance) {
>>      std::cout << "intel baseline = " << rate << " MB/sec" << std::endl;
>>      ASSERT_EQ(261108528u, val);
>>    }
>> +  {
>> +    utime_t start = ceph_clock_now(NULL);
>> +    unsigned val = ceph_crc32c_arm64(0, (unsigned char *)a, len);
>> +    utime_t end = ceph_clock_now(NULL);
>> +    float rate = (float)len / (float)(1024*1024) / (float)(end - start);
>> +    std::cout << "arm64 baseline = " << rate << " MB/sec" << std::endl;
>> +    ASSERT_EQ(261108528u, val);
>> +  }
>>
>
> What about x86? :-)
>
Unfortunately, I don't have an x86 system with SSE4, but I'll try to
find one and test this for reference.
Also, I'm adding a check here to remove the test if HAVE_ARMV8_CRC is
not defined.
>
>>  }
>>
>> --
>> 2.2.0
>>

_______________________________________________
linaro-dev mailing list
linaro-dev@lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-dev

Reply via email to