Hi, I would like to add support for hardware CRC for ARM's new 64-bit architecture, aarch64.
I would be grateful if some committer could help me through the process of getting this change pushed into the trunk. I have prepared an initial patch below. The patch is completely conditionalized on __aarch64__. For the moment I have only done the non-pipelined version as the hw I have only has 1 crc execute unit. Some initial benchmarks on terasort give sw crc: 107 sec, hw crc: 103 sec. The performance improvement is quite small, but this is limited by the fact that I am using early-stage hw which is not performant. I have also built it on x86 and I think the change is fairly safe for other architectures because post conditionalization the src is identical on other architectures. Thanks for your help, Ed. --- CUT HERE --- Index: hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c =================================================================== --- hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c (revision 1605031) +++ hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c (working copy) @@ -38,7 +38,7 @@ #include "bulk_crc32.h" #include "gcc_optimizations.h" -#if (!defined(__FreeBSD__) && !defined(WINDOWS)) +#if (!defined(__FreeBSD__) && !defined(WINDOWS)) && !defined(__aarch64__) #define USE_PIPELINED #endif @@ -672,8 +672,61 @@ # endif // 64-bit vs 32-bit -#else // end x86 architecture +#elif defined(__aarch64__) // end x86 architecture +#include <sys/auxv.h> +#include <asm/hwcap.h> + +#ifndef HWCAP_CRC32 +#define HWCAP_CRC32 (1<<7) +#endif + +/** + * On library load, determine what sort of crc we are going to do + * and set cached_cpu_supports_crc32 appropriately. 
+ */ +void __attribute__ ((constructor)) init_cpu_support_flag(void) { + unsigned long auxv = getauxval(AT_HWCAP); + cached_cpu_supports_crc32 = auxv & HWCAP_CRC32; +} + +#define CRC32X(crc,value) asm("crc32cx %w[c], %w[c], %x[v]" : [c]"+r"(crc) : [v]"r"(value)) +#define CRC32W(crc,value) asm("crc32cw %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value)) +#define CRC32H(crc,value) asm("crc32ch %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value)) +#define CRC32B(crc,value) asm("crc32cb %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value)) + +/** + * Hardware-accelerated CRC32C calculation using the 64-bit instructions. + */ +static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) { + int64_t len = length; + asm(".cpu generic+crc"); // Allow crc instructions in asm + if ((len -= sizeof(uint64_t)) >= 0) { + do { + CRC32X(crc, *(uint64_t*)p_buf); + p_buf += sizeof(uint64_t); + } while ((len -= sizeof(uint64_t)) >= 0); + } + + // The following is more efficient than the straight loop + if (len & sizeof(uint32_t)) { + CRC32W(crc, *(uint32_t*)p_buf); + p_buf += sizeof(uint32_t); + } + if (len & sizeof(uint16_t)) { + CRC32H(crc, *(uint16_t*)p_buf); + p_buf += sizeof(uint16_t); + } + if (len & sizeof(uint8_t)) { + CRC32B(crc, *p_buf); + p_buf++; + } + + return crc; +} + +#else + static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length) { // never called! assert(0 && "hardware crc called on an unsupported platform"); --- CUT HERE ---