Updated the patch based on the comments.

On 2023/6/15 18:30, John Naylor wrote:

On Wed, Jun 14, 2023 at 9:20 AM YANG Xudong <yangxud...@ymatrix.cn <mailto:yangxud...@ymatrix.cn>> wrote:
 >
 > Attached a new patch with fixes based on the comment below.

Note: It's helpful to pass "-v" to git format-patch, to have different versions.


Added v2

 > > For x86 and Arm, if it fails to link without an -march flag, we allow
 > > for a runtime check. The flags "-march=armv8-a+crc" and "-msse4.2" are
 > > for instructions not found on all platforms. The patch also checks both
 > > ways, and each one results in "Use LoongArch CRC instruction
 > > unconditionally". The -march flag here is general, not specific. In
> > other words, if this only runs inside "+elif host_cpu == 'loongarch64'",
 > > why do we need both with -march and without?
 > >
 >
 > Removed the elif branch.

Okay, since we've confirmed that no arch flag is necessary, some other places can be simplified:

--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
  pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
  pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)

+# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC
+pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC)

This was copy-and-pasted from platforms that use a runtime check, so should be unnecessary.


Removed these lines.

+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.

+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.

Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you confirm?


We don't need to set CFLAGS_CRC as commented. I have updated the configure script to make it align with the logic in meson build script.

 > > Also, I don't have a Loongarch machine for testing. Could you show that
 > > the instructions are found in the binary, maybe using objdump and grep?
 > > Or a performance test?
 > >
 >
 > The output of the objdump command `objdump -dS
 > ../postgres-build/tmp_install/usr/local/pgsql/bin/postgres  | grep -B 30
 > -A 10 crcc` is attached.

Thanks for confirming.

--
John Naylor
EDB: http://www.enterprisedb.com <http://www.enterprisedb.com>
From b2329478b5331e2aa9942c7ee4e23e3bfa871c1d Mon Sep 17 00:00:00 2001
From: YANG Xudong <yangxud...@ymatrix.cn>
Date: Fri, 16 Jun 2023 09:22:08 +0800
Subject: [PATCH v2] Add loongarch native checksum implementation.

---
 config/c-compiler.m4           | 34 ++++++++++++++++
 configure                      | 73 ++++++++++++++++++++++++++++++----
 configure.ac                   | 33 +++++++++++----
 meson.build                    | 24 +++++++++++
 src/include/pg_config.h.in     |  3 ++
 src/include/port/pg_crc32c.h   |  9 +++++
 src/port/meson.build           |  3 ++
 src/port/pg_crc32c_loongarch.c | 72 +++++++++++++++++++++++++++++++++
 8 files changed, 236 insertions(+), 15 deletions(-)
 create mode 100644 src/port/pg_crc32c_loongarch.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5be8f0f08d..eb3af009c4 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -661,3 +661,37 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics_$1])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, 
__builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with 
CFLAGS=$1],
+  [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+  [unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_CRC="$1"
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
diff --git a/configure b/configure
index 1b415142d1..7c60a26c11 100755
--- a/configure
+++ b/configure
@@ -18114,6 +18114,51 @@ fi
 
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for 
__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, 
__builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=" 
>&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, 
__builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and 
__builtin_loongarch_crcc_w_d_w with CFLAGS=... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics_=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_loongarch_crc32c_intrinsics_" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics_" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics_" = x"yes"; then
+  CFLAGS_CRC=""
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+
 
 
 # Select CRC-32C implementation.
@@ -18132,7 +18177,7 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" 
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test 
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = 
x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" 
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test 
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = 
x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" 
= x"1" ; then
     USE_SSE42_CRC32C=1
@@ -18150,10 +18195,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test 
x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-       fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -18194,12 +18244,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 
1" >>confdefs.h
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC 
instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC 
instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+        else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+        fi
       fi
     fi
   fi
diff --git a/configure.ac b/configure.ac
index 09558ada0f..d44e9f9ef8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2126,6 +2126,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
   PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+PGAC_LOONGARCH_CRC32C_INTRINSICS([])
+
 AC_SUBST(CFLAGS_CRC)
 
 # Select CRC-32C implementation.
@@ -2144,7 +2150,7 @@ AC_SUBST(CFLAGS_CRC)
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" 
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test 
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = 
x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" 
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test 
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = 
x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" 
= x"1" ; then
     USE_SSE42_CRC32C=1
@@ -2162,10 +2168,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test 
x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-       fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -2193,9 +2204,15 @@ else
         PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o 
pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
-        AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software 
CRC-32C implementation (slicing-by-8).])
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        AC_MSG_RESULT(slicing-by-8)
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch 
CRCC instructions.])
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          AC_MSG_RESULT(LoongArch CRCC instructions)
+        else
+          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software 
CRC-32C implementation (slicing-by-8).])
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          AC_MSG_RESULT(slicing-by-8)
+        fi
       fi
     fi
   fi
diff --git a/meson.build b/meson.build
index 82f2782673..77a9b985df 100644
--- a/meson.build
+++ b/meson.build
@@ -2073,6 +2073,30 @@ int main(void)
     cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   endif
+
+elif host_cpu == 'loongarch64'
+
+  prog = '''
+int main(void)
+{
+    unsigned int crc = 0;
+    crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+    /* return computed value, to prevent the above being optimized away */
+    return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, 
__builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and 
__builtin_loongarch_crcc_w_d_w',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  endif
+
 endif
 
 if not have_optimized_crc
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 6d572c3820..1f253566e4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -724,6 +724,9 @@
 /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
 #undef USE_LLVM
 
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
 /* Define to 1 to build with LZ4 support. (--with-lz4) */
 #undef USE_LZ4
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 7f8779261c..d085f1dc00 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const 
void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t 
len);
 
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len)                                            
        \
+       ((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, 
size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || 
defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index 24416b9bfc..3e77b2493a 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644
index 0000000000..b54fe12ff5
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *       Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *       src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+       const unsigned char *p = data;
+       const unsigned char *pend = p + len;
+
+       /*
+        * Aligned memory access is significantly faster.
+        * Process leading bytes so that the loop below starts with a pointer 
aligned to eight bytes.
+        */
+       if (!PointerIsAligned(p, uint16) &&
+               p + 1 <= pend)
+       {
+               crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+               p += 1;
+       }
+       if (!PointerIsAligned(p, uint32) &&
+               p + 2 <= pend)
+       {
+               crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+               p += 2;
+       }
+       if (!PointerIsAligned(p, uint64) &&
+               p + 4 <= pend)
+       {
+               crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+               p += 4;
+       }
+
+       /* Process eight bytes at a time, as far as we can. */
+       while (p + 8 <= pend)
+       {
+               crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+               p += 8;
+       }
+
+       /* Process remaining 0-7 bytes. */
+       if (p + 4 <= pend)
+       {
+               crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+               p += 4;
+       }
+       if (p + 2 <= pend)
+       {
+               crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+               p += 2;
+       }
+       if (p < pend)
+       {
+               crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+       }
+
+       return crc;
+}
-- 
2.41.0

Reply via email to