On Thu, Jan 22, 2015 at 4:58 AM, Steve Capper <steve.cap...@linaro.org> wrote: > Hi Yazen, > > I have a few comments below. > > Cheers, > -- > Steve > > > On Wed, Jan 21, 2015 at 02:17:47PM -0600, Yazen Ghannam wrote: >> Tested on AMD Seattle. >> Passes all crc32c unit tests. >> ~4x performance increase versus sctp. > > Was that perf result from the unit test? > Yes, this was from the unit test.
> Also describe the reason for the patch in the commit log; we have an > optional CRC instruction that can be used instead of the Ceph in-built > table lookup. This patch uses the CRC instruction where available... > > Unit tests are added and they show a ... > > Probably also mention that this based off Ed Nevill's Hadoop patch. > Will do. >> >> Signed-off-by: Yazen Ghannam <yazen.ghan...@linaro.org> >> --- >> configure.ac | 1 + >> m4/ax_arm.m4 | 18 +++++++++++++--- >> src/arch/arm.c | 2 ++ >> src/arch/arm.h | 1 + >> src/common/Makefile.am | 11 ++++++++-- >> src/common/crc32c.cc | 6 ++++++ >> src/common/crc32c_arm64.c | 48 >> ++++++++++++++++++++++++++++++++++++++++++ >> src/common/crc32c_arm64.h | 25 ++++++++++++++++++++++ >> src/test/common/test_crc32c.cc | 9 ++++++++ >> 9 files changed, 116 insertions(+), 5 deletions(-) >> create mode 100644 src/common/crc32c_arm64.c >> create mode 100644 src/common/crc32c_arm64.h >> >> diff --git a/configure.ac b/configure.ac >> index d836b02..60e4feb 100644 >> --- a/configure.ac >> +++ b/configure.ac >> @@ -575,6 +575,7 @@ AC_LANG_POP([C++]) >> # Find supported SIMD / NEON / SSE extensions supported by the compiler >> AX_ARM_FEATURES() >> AM_CONDITIONAL(HAVE_NEON, [ test "x$ax_cv_support_neon_ext" = "xyes"]) >> +AM_CONDITIONAL(HAVE_ARMV8_CRC, [ test "x$ax_cv_support_crc_ext" = "xyes"]) >> AX_INTEL_FEATURES() >> AM_CONDITIONAL(HAVE_SSSE3, [ test "x$ax_cv_support_ssse3_ext" = "xyes"]) >> AM_CONDITIONAL(HAVE_SSE4_PCLMUL, [ test "x$ax_cv_support_pclmuldq_ext" = >> "xyes"]) >> diff --git a/m4/ax_arm.m4 b/m4/ax_arm.m4 >> index 2ccc9a9..4ef8081 100644 >> --- a/m4/ax_arm.m4 >> +++ b/m4/ax_arm.m4 >> @@ -13,13 +13,25 @@ AC_DEFUN([AX_ARM_FEATURES], >> fi >> ;; >> aarch64*) >> + AX_CHECK_COMPILE_FLAG(-march=armv8-a, ax_cv_support_armv8=yes, []) >> + if test x"$ax_cv_support_armv8" = x"yes"; then >> + ARM_ARCH_FLAGS="-march=armv8-a" >> + ARM_DEFINE_FLAGS="-DARCH_AARCH64" >> + fi >> AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, >> ax_cv_support_neon_ext=yes, []) >> if test x"$ax_cv_support_neon_ext" = x"yes"; then >> - ARM_NEON_FLAGS="-march=armv8-a+simd -DARCH_AARCH64 -DARM_NEON" >> - AC_SUBST(ARM_NEON_FLAGS) >> - ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS" >> + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+simd" >> + ARM_DEFINE_FLAGS="$ARM_DEFINE_FLAGS -DARM_NEON" >> AC_DEFINE(HAVE_NEON,,[Support NEON instructions]) >> fi >> + AX_CHECK_COMPILE_FLAG(-march=armv8-a+crc, ax_cv_support_crc_ext=yes, >> []) >> + if test x"$ax_cv_support_crc_ext" = x"yes"; then >> + ARM_ARCH_FLAGS="$ARM_ARCH_FLAGS+crc" >> + AC_DEFINE(HAVE_ARMV8_CRC,,[Support ARMv8 CRC instructions]) >> + fi >> + ARM_NEON_FLAGS="$ARM_ARCH_FLAGS $ARM_DEFINE_FLAGS" >> + AC_SUBST(ARM_NEON_FLAGS) >> + ARM_FLAGS="$ARM_FLAGS $ARM_NEON_FLAGS" > > So if the compiler supports CRC, then ARM_FLAGS will be set with CRC > compile options. This will also affect the jerasure codegen (which > doesn't check for the CRC HWCAP). Although this is unlikely to cause > problems, I would feel more comfortable with something like: > ARM_CRC_FLAGS > That way you disambiguate between NEON (for jerasure) and CRC. > It looks to me that jerasure only uses ARM_NEON_FLAGS. What I'll do is have minimal flags for each feature (e.g. NEON and CRC) and put all the available flags in ARM_FLAGS. >> ;; >> esac >> >> diff --git a/src/arch/arm.c b/src/arch/arm.c >> index 93d079a..2ea97eb 100644 >> --- a/src/arch/arm.c >> +++ b/src/arch/arm.c >> @@ -2,6 +2,7 @@ >> >> /* flags we export */ >> int ceph_arch_neon = 0; >> +int ceph_arch_arm64_crc32 = 0; > > <bikeshedding-comment> > Can we rename this variable: ceph_arch_aarch64_crc32 ? > </bikeshedding-comment> > Will do. >> >> #include <stdio.h> >> >> @@ -47,6 +48,7 @@ int ceph_arch_arm_probe(void) >> ceph_arch_neon = (get_hwcap() & HWCAP_NEON) == HWCAP_NEON; >> #elif __aarch64__ && __linux__ >> ceph_arch_neon = (get_hwcap() & HWCAP_ASIMD) == HWCAP_ASIMD; >> + ceph_arch_arm64_crc32 = (get_hwcap() & HWCAP_CRC32) == HWCAP_CRC32; >> #else >> if (0) >> get_hwcap(); // make compiler shut up >> diff --git a/src/arch/arm.h b/src/arch/arm.h >> index f613438..4ac716a 100644 >> --- a/src/arch/arm.h >> +++ b/src/arch/arm.h >> @@ -6,6 +6,7 @@ extern "C" { >> #endif >> >> extern int ceph_arch_neon; /* true if we have ARM NEON or ASIMD abilities >> */ >> +extern int ceph_arch_arm64_crc32; /* true if we have Aarch64 CRC32/CRC32C >> abilities */ > > "AArch64", we have two capital A's. > Ack. >> >> extern int ceph_arch_arm_probe(void); >> >> diff --git a/src/common/Makefile.am b/src/common/Makefile.am >> index 2888194..4c36216 100644 >> --- a/src/common/Makefile.am >> +++ b/src/common/Makefile.am >> @@ -103,12 +103,18 @@ libcommon_crc_la_SOURCES = \ >> common/sctp_crc32.c \ >> common/crc32c.cc \ >> common/crc32c_intel_baseline.c \ >> - common/crc32c_intel_fast.c >> + common/crc32c_intel_fast.c \ >> + common/crc32c_arm64.c > > I would recommend crc32c_aarch64.c here too. > Ack. >> >> if WITH_GOOD_YASM_ELF64 >> libcommon_crc_la_SOURCES += common/crc32c_intel_fast_asm.S >> common/crc32c_intel_fast_zero_asm.S >> libcommon_crc_la_LIBTOOLFLAGS = --tag=CC >> endif >> + >> +if HAVE_ARMV8_CRC >> +libcommon_crc_la_CFLAGS = $(AM_CFLAGS) $(ARM_FLAGS) >> +endif >> + > > The problem with this is that all the files in the > libcommon_crc_la_SOURCES will be compiled with this flag. We want to > limit the scope to just the ARM CRC file. > I don't understand the issue here. Can you please explain? There is a check for HAVE_ARMV8_CRC which is only set if these flags are valid. This seems to work fine on aarch64 and x86. >> LIBCOMMON_DEPS += libcommon_crc.la >> noinst_LTLIBRARIES += libcommon_crc.la >> >> @@ -116,7 +122,8 @@ noinst_HEADERS += \ >> common/bloom_filter.hpp \ >> common/sctp_crc32.h \ >> common/crc32c_intel_baseline.h \ >> - common/crc32c_intel_fast.h >> + common/crc32c_intel_fast.h \ >> + common/crc32c_arm64.h >> >> >> # important; libmsg before libauth! >> diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc >> index e2e81a4..540d431 100644 >> --- a/src/common/crc32c.cc >> +++ b/src/common/crc32c.cc >> @@ -5,9 +5,11 @@ >> >> #include "arch/probe.h" >> #include "arch/intel.h" >> +#include "arch/arm.h" >> #include "common/sctp_crc32.h" >> #include "common/crc32c_intel_baseline.h" >> #include "common/crc32c_intel_fast.h" >> +#include "common/crc32c_arm64.h" > > Does pulling these ARM header files in affect how this compiles for > other architectures? Could you please give this a quick build on > x86? > Builds fine on x86. I followed the same format as the intel files where arch-specific stuff is #ifdef'd out. See next comment. >> >> /* >> * choose best implementation based on the CPU architecture. >> @@ -24,6 +26,10 @@ ceph_crc32c_func_t ceph_choose_crc32(void) >> return ceph_crc32c_intel_fast; >> } >> >> + if (ceph_arch_arm64_crc32){ >> + return ceph_crc32c_arm64; >> + } >> + >> // default >> return ceph_crc32c_sctp; >> } >> diff --git a/src/common/crc32c_arm64.c b/src/common/crc32c_arm64.c >> new file mode 100644 >> index 0000000..6a754fb >> --- /dev/null >> +++ b/src/common/crc32c_arm64.c >> @@ -0,0 +1,48 @@ >> +#include "acconfig.h" >> +#include "include/int_types.h" >> +#include "common/crc32c_arm64.h" >> + >> +#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], >> %x[v]":[c]"+r"(crc):[v]"r"(value)) >> +#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], >> %w[v]":[c]"+r"(crc):[v]"r"(value)) >> +#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], >> %w[v]":[c]"+r"(crc):[v]"r"(value)) >> +#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], >> %w[v]":[c]"+r"(crc):[v]"r"(value)) >> + > > This won't likely build for other architectures. > I would recommend something like: > > #ifdef __aarch64__ > CRC routines > #else > empty functions > #endif /* __arch64__ */ > Yes, you are correct. I will add a check here. The empty function is defined in the header file. >> +uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const *buffer, >> unsigned len) >> +{ >> + int64_t length = len; >> + >> + if (!buffer) { >> + >> + while ((length -= sizeof(uint64_t)) >= 0) >> + CRC32CX(crc, 0); >> + >> + /* The following is more efficient than the straight loop */ >> + if (length & sizeof(uint32_t)) >> + CRC32CW(crc, 0); >> + >> + if (length & sizeof(uint16_t)) >> + CRC32CH(crc, 0); >> + >> + if (length & sizeof(uint8_t)) >> + CRC32CB(crc, 0); >> + } else { >> + while ((length -= sizeof(uint64_t)) >= 0) { >> + CRC32CX(crc, *(uint64_t *)buffer); >> + buffer += sizeof(uint64_t); >> + } >> + >> + /* The following is more efficient than the straight loop */ >> + if (length & sizeof(uint32_t)) { >> + CRC32CW(crc, *(uint32_t *)buffer); >> + buffer += sizeof(uint32_t); >> + } >> + if (length & sizeof(uint16_t)) { >> + CRC32CH(crc, *(uint16_t *)buffer); >> + buffer += sizeof(uint16_t); >> + } >> + if (length & sizeof(uint8_t)) >> + CRC32CB(crc, *buffer); >> + } >> + return crc; >> +} >> + >> diff --git a/src/common/crc32c_arm64.h b/src/common/crc32c_arm64.h >> new file mode 100644 >> index 0000000..100b5e4 >> --- /dev/null >> +++ b/src/common/crc32c_arm64.h >> @@ -0,0 +1,25 @@ >> +#ifndef CEPH_COMMON_CRC32C_ARM64_H >> +#define CEPH_COMMON_CRC32C_ARM64_H >> + >> +#ifdef __cplusplus >> +extern "C" { >> +#endif >> + >> +#ifdef __aarch64__ >> + >> +extern uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const >> *buffer, unsigned len); >> + >> +#else >> + >> +static inline uint32_t ceph_crc32c_arm64(uint32_t crc, unsigned char const >> *buffer, unsigned len) >> +{ >> + return 0; >> +} >> + >> +#endif >> + >> +#ifdef __cplusplus >> +} >> +#endif >> + >> +#endif >> diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc >> index b4297c6..1a9b2e2 100644 >> --- a/src/test/common/test_crc32c.cc >> +++ b/src/test/common/test_crc32c.cc >> @@ -13,6 +13,7 @@ >> >> #include "common/sctp_crc32.h" >> #include "common/crc32c_intel_baseline.h" >> +#include "common/crc32c_arm64.h" >> >> TEST(Crc32c, Small) { >> const char *a = "foo bar baz"; >> @@ -80,6 +81,14 @@ TEST(Crc32c, Performance) { >> std::cout << "intel baseline = " << rate << " MB/sec" << std::endl; >> ASSERT_EQ(261108528u, val); >> } >> + { >> + utime_t start = ceph_clock_now(NULL); >> + unsigned val = ceph_crc32c_arm64(0, (unsigned char *)a, len); >> + utime_t end = ceph_clock_now(NULL); >> + float rate = (float)len / (float)(1024*1024) / (float)(end - start); >> + std::cout << "arm64 baseline = " << rate << " MB/sec" << std::endl; >> + ASSERT_EQ(261108528u, val); >> + } >> > > What about x86? :-) > Unfortunately, I don't have an x86 system with SSE4, but I'll try to find one and test this for reference. Also, I'm adding a check here to remove the test if HAVE_ARMV8_CRC is not defined. > >> } >> >> -- >> 2.2.0 >> _______________________________________________ linaro-dev mailing list linaro-dev@lists.linaro.org http://lists.linaro.org/mailman/listinfo/linaro-dev