Hi Yazen, I have a few comments below.
Cheers, -- Steve On Wed, Jan 21, 2015 at 02:17:47PM -0600, Yazen Ghannam wrote: > Tested on AMD Seattle. > Passes all crc32c unit tests. > ~4x performance increase versus sctp. Was that perf result from the unit test? Also describe the reason for the patch in the commit log; we have an optional CRC instruction that can be used instead of the Ceph in-built table lookup. This patch uses the CRC instruction where available... Unit tests are added and they show a ... Probably also mention that this based off Ed Nevill's Hadoop patch. > > Signed-off-by: Yazen Ghannam <yazen.ghan...@linaro.org> > --- > configure.ac | 1 + > m4/ax_arm.m4 | 18 +++++++++++++--- > src/arch/arm.c | 2 ++ > src/arch/arm.h | 1 + > src/common/Makefile.am | 11 ++++++++-- > src/common/crc32c.cc | 6 ++++++ > src/common/crc32c_arm64.c | 48 > ++++++++++++++++++++++++++++++++++++++++++ > src/common/crc32c_arm64.h | 25 ++++++++++++++++++++++ > src/test/common/test_crc32c.cc | 9 ++++++++ > 9 files changed, 116 insertions(+), 5 deletions(-) > create mode 100644 src/common/crc32c_arm64.c > create mode 100644 src/common/crc32c_arm64.h > > diff --git a/configure.ac b/configure.ac > index d836b02..60e4feb 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -575,6 +575,7 @@ AC_LANG_POP([C++]) > # Find supported SIMD / NEON / SSE extensions supported by the compiler > AX_ARM_FEATURES() > AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"]) > +AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"]) > AX_INTEL_FEATURES() > AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"]) > AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = > "xyes"]) > diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4 > index 2ccc9a9..4ef8081 100644 > --- a/m4/ax_arm.m4 > +++ b/m4/ax_arm.m4 > @@ -13,13 +13,25 @@ AC_DEFUN([AX_ARM_FEATURES], > fi > ;; > aarch64*) > + AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, []) > + if test x"$ax_cv_support_armv8" = x"yes"; then > + ARM_ARCH_FLAGS="-march=armv8-a" > + ARM_DEFINE_FLAGS="-DARCH_AARCH64" > + fi > AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, ax_cv_support_neon_ext=yes, > []) > if test x"$ax_cv_support_neon_ext" = x"yes"; then > - ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON" > - AC_SUBST(ARM_NEON_FLAGS) > - ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS" > + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd" > + ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON" > AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) > fi > + AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, > []) > + if test x"$ax_cv_support_crc_ext" = x"yes"; then > + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc" > + AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions]) > + fi > + ARM_NEON_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS" > + AC_SUBST(ARM_NEON_FLAGS) > + ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS" So if the compiler supports CRC, then ARM_FLAGS will be set with CRC compile options. This will also affect the jerasure codegen (which doesn't check for the CRC HWCAP). Although this is unlikely to cause problems, I would feel more comfortable with something like: ARM_CRC_FLAGS That way you disambiguate between NEON (for jerasure) and CRC. > ;; > esac > > diff --git a/src/arch/arm.c b/src/arch/arm.c > index 93d079a..2ea97eb 100644 > --- a/src/arch/arm.c > +++ b/src/arch/arm.c > @@ -2,6 +2,7 @@ > > /* flags we export */ > int ceph_arch_neon = 0; > +int ceph_arch_arm64_crc32 = 0; <bikeshedding-comment> Can we rename this variable: ceph_arch_aarch64_crc32 ? </bikeshedding-comment> > > #include <stdio.h> > > @@ -47,6 +48,7 @@ int ceph_arch_arm_probe(void) > ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON; > #elif __aarch64__ && __linux__ > ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD; > + ceph_arch_arm64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32; > #else > if (0) > get_hwcap(); // make compiler shut up > diff --git a/src/arch/arm.h b/src/arch/arm.h > index f613438..4ac716a 100644 > --- a/src/arch/arm.h > +++ b/src/arch/arm.h > @@ -6,6 +6,7 @@ extern "C" { > #endif > > extern int ceph_arch_neon; /* true if we have ARM NEON or ASIMD abilities */ > +extern int ceph_arch_arm64_crc32; /* true if we have Aarch64 CRC32/CRC32C > abilities */ "AArch64", we have two capital A's. > > extern int ceph_arch_arm_probe(void); > > diff --git a/src/common/Makefile.am b/src/common/Makefile.am > index 2888194..4c36216 100644 > --- a/src/common/Makefile.am > +++ b/src/common/Makefile.am > @@ -103,12 +103,18 @@ libcommon_crc_la_SOURCES = \ > common/sctp_crc32.c \ > common/crc32c.cc \ > common/crc32c_intel_baseline.c \ > - common/crc32c_intel_fast.c > + common/crc32c_intel_fast.c \ > + common/crc32c_arm64.c I would recommend crc32c_aarch64.c here too. > > if WITH_GOOD_YASM_ELF64 > libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S > common/crc32c_intel_fast_zero_asm.S > libcommon_crc_la_LIBTOOLFLAGS = --tag=CC > endif > + > +if HAVE_ARMV8_CRC > +libcommon_crc_la_CFLAGS = $(AM_CFLAGS) $(ARM_FLAGS) > +endif > + The problem with this is that all the files in the libcommon_crc_la_SOURCES will be compiled with this flag. We want to limit the scope to just the ARM CRC file. > LIBCOMMON_DEPS += libcommon_crc.la > noinst_LTLIBRARIES += libcommon_crc.la > > @@ -116,7 +122,8 @@ noinst_HEADERS += \ > common/bloom_filter.hpp \ > common/sctp_crc32.h \ > common/crc32c_intel_baseline.h \ > - common/crc32c_intel_fast.h > + common/crc32c_intel_fast.h \ > + common/crc32c_arm64.h > > > # important; libmsg before libauth! > diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc > index e2e81a4..540d431 100644 > --- a/src/common/crc32c.cc > +++ b/src/common/crc32c.cc > @@ -5,9 +5,11 @@ > > #include "arch/probe.h" > #include "arch/intel.h" > +#include "arch/arm.h" > #include "common/sctp_crc32.h" > #include "common/crc32c_intel_baseline.h" > #include "common/crc32c_intel_fast.h" > +#include "common/crc32c_arm64.h" Does pulling these ARM header files in affect how this compiles for other architectures? Could you please give this a quick build on x86? > > /* > * choose best implementation based on the CPU architecture. > @@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void) > return ceph_crc32c_intel_fast; > } > > + if (ceph_arch_arm64_crc32){ > + return ceph_crc32c_arm64; > + } > + > // default > return ceph_crc32c_sctp; > } > diff --git a/src/common/crc32c_arm64.c b/src/common/crc32c_arm64.c > new file mode 100644 > index 0000000..6a754fb > --- /dev/null > +++ b/src/common/crc32c_arm64.c > @@ -0,0 +1,48 @@ > +#include "acconfig.h" > +#include "include/int_types.h" > +#include "common/crc32c_arm64.h" > + > +#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], > %x[v]":[c]"+r"(crc):[v]"r"(value)) > +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], > %w[v]":[c]"+r"(crc):[v]"r"(value)) > +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], > %w[v]":[c]"+r"(crc):[v]"r"(value)) > +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], > %w[v]":[c]"+r"(crc):[v]"r"(value)) > + This won't likely build for other architectures. I would recommend something like: #ifdef __aarch64__ CRC routines #else empty functions #endif /* __arch64__ */ > +uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, > unsigned len) > +{ > + int64_t length = len; > + > + if (!buffer) { > + > + while ((length -= sizeof(uint64_t)) >= 0) > + CRC32CX(crc, 0); > + > + /* The following is more efficient than the straight loop */ > + if (length & sizeof(uint32_t)) > + CRC32CW(crc, 0); > + > + if (length & sizeof(uint16_t)) > + CRC32CH(crc, 0); > + > + if (length & sizeof(uint8_t)) > + CRC32CB(crc, 0); > + } else { > + while ((length -= sizeof(uint64_t)) >= 0) { > + CRC32CX(crc, *(uint64_t *)buffer); > + buffer += sizeof(uint64_t); > + } > + > + /* The following is more efficient than the straight loop */ > + if (length & sizeof(uint32_t)) { > + CRC32CW(crc, *(uint32_t *)buffer); > + buffer += sizeof(uint32_t); > + } > + if (length & sizeof(uint16_t)) { > + CRC32CH(crc, *(uint16_t *)buffer); > + buffer += sizeof(uint16_t); > + } > + if (length & sizeof(uint8_t)) > + CRC32CB(crc, *buffer); > + } > + return crc; > +} > + > diff --git a/src/common/crc32c_arm64.h b/src/common/crc32c_arm64.h > new file mode 100644 > index 0000000..100b5e4 > --- /dev/null > +++ b/src/common/crc32c_arm64.h > @@ -0,0 +1,25 @@ > +#ifndef CEPH_COMMON_CRC32C_ARM64_H > +#define CEPH_COMMON_CRC32C_ARM64_H > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +#ifdef __aarch64__ > + > +extern uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, > unsigned len); > + > +#else > + > +static inline uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const > *buffer, unsigned len) > +{ > + return 0; > +} > + > +#endif > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif > diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc > index b4297c6..1a9b2e2 100644 > --- a/src/test/common/test_crc32c.cc > +++ b/src/test/common/test_crc32c.cc > @@ -13,6 +13,7 @@ > > #include "common/sctp_crc32.h" > #include "common/crc32c_intel_baseline.h" > +#include "common/crc32c_arm64.h" > > TEST(Crc32c, Small) { > const char *a = "foo bar baz"; > @@ -80,6 +81,14 @@ TEST(Crc32c, Performance) { > std::cout << "intel baseline = " << rate << " MB/sec" << std::endl; > ASSERT_EQ(261108528u, val); > } > + { > + utime_t start = ceph_clock_now(NULL); > + unsigned val = ceph_crc32c_arm64(0, (unsigned char *)a, len); > + utime_t end = ceph_clock_now(NULL); > + float rate = (float)len / (float)(1024*1024) / (float)(end - start); > + std::cout << "arm64 baseline = " << rate << " MB/sec" << std::endl; > + ASSERT_EQ(261108528u, val); > + } > What about x86? :-) > } > > -- > 2.2.0 > _______________________________________________ linaro-dev mailing list linaro-dev@lists.linaro.org http://lists.linaro.org/mailman/listinfo/linaro-dev