Hi,

I would like to add support for hardware CRC on ARM's new 64-bit architecture, 
aarch64.

I would be grateful if a committer could help me through the process of 
getting this change pushed into the trunk.

I have prepared an initial patch below.

The patch is completely conditionalized on __aarch64__.

For the moment I have only done the non-pipelined version, as the hardware I 
have has only one CRC execution unit.

Some initial benchmarks on terasort give

sw crc: 107 sec
hw crc: 103 sec

The performance improvement is quite small, but this is limited by the fact 
that I am using early-stage hardware which is not yet performant.

I have also built it on x86, and I think the change is fairly safe for other 
architectures because, once the preprocessor conditionals are resolved, the 
source is identical on those architectures.

Thanks for your help,
Ed.


--- CUT HERE ---
Index: 
hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
===================================================================
--- 
hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
 (revision 1605031)
+++ 
hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
 (working copy)
@@ -38,7 +38,7 @@
 #include "bulk_crc32.h"
 #include "gcc_optimizations.h"
 
-#if (!defined(__FreeBSD__) && !defined(WINDOWS))
+#if (!defined(__FreeBSD__) && !defined(WINDOWS)) && !defined(__aarch64__)
 #define USE_PIPELINED
 #endif
 
@@ -672,8 +672,61 @@
 
 # endif // 64-bit vs 32-bit
 
-#else // end x86 architecture
+#elif defined(__aarch64__) // end x86 architecture
 
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1<<7)
+#endif
+
+/**
+ * Library-load hook (constructor attribute): reads AT_HWCAP via getauxval()
+ * and caches whether HWCAP_CRC32 is set in cached_cpu_supports_crc32.
+ */
+void __attribute__ ((constructor)) init_cpu_support_flag(void) {
+  unsigned long auxv = getauxval(AT_HWCAP);
+  cached_cpu_supports_crc32 = auxv & HWCAP_CRC32; // stores the masked bit itself, not normalized to 0/1
+}
+
+#define CRC32X(crc,value) asm("crc32cx %w[c], %w[c], %x[v]" : [c]"+r"(crc) : 
[v]"r"(value))
+#define CRC32W(crc,value) asm("crc32cw %w[c], %w[c], %w[v]" : [c]"+r"(crc) : 
[v]"r"(value))
+#define CRC32H(crc,value) asm("crc32ch %w[c], %w[c], %w[v]" : [c]"+r"(crc) : 
[v]"r"(value))
+#define CRC32B(crc,value) asm("crc32cb %w[c], %w[c], %w[v]" : [c]"+r"(crc) : 
[v]"r"(value))
+
+/**
+ * Hardware CRC32C of length bytes at p_buf, seeded with crc; returns the updated crc.
+ */
+static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t 
length) {
+  int64_t len = length;        // signed copy: the ">= 0" tests below rely on it going negative
+  asm(".cpu generic+crc");     // Assembler directive: allow crc instructions in asm
+  if ((len -= sizeof(uint64_t)) >= 0) { // true when at least 8 bytes are available
+    do {
+      CRC32X(crc, *(uint64_t*)p_buf); // NOTE(review): direct 64-bit load via cast; assumes unaligned access is acceptable - confirm
+      p_buf += sizeof(uint64_t);
+    } while ((len -= sizeof(uint64_t)) >= 0);
+  }
+
+  // Tail without a loop: len is now (bytes left - 8), so its low 3 bits == bytes left (0..7)
+  if (len & sizeof(uint32_t)) { // bit 2 set: fold in a 4-byte piece
+      CRC32W(crc, *(uint32_t*)p_buf);
+      p_buf += sizeof(uint32_t);
+  }
+  if (len & sizeof(uint16_t)) { // bit 1 set: fold in a 2-byte piece
+      CRC32H(crc, *(uint16_t*)p_buf);
+      p_buf += sizeof(uint16_t);
+  }
+  if (len & sizeof(uint8_t)) { // bit 0 set: fold in the final byte
+      CRC32B(crc, *p_buf);
+      p_buf++;
+  }
+
+  return crc;
+}
+
+#else
+
 static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t 
length) {
   // never called!
   assert(0 && "hardware crc called on an unsupported platform");
--- CUT HERE ---


Reply via email to