On Sat, Mar 13, 2021 at 4:37 AM John Naylor
<john.nay...@enterprisedb.com> wrote:
> On Fri, Mar 12, 2021 at 9:14 AM Amit Khandekar <amitdkhan...@gmail.com> wrote:
> > I was not thinking about auto-vectorizing the code in
> > pg_validate_utf8_sse42(). Rather, I was considering auto-vectorization
> > inside the individual helper functions that you wrote, such as
> > _mm_setr_epi8(), shift_right(), bitwise_and(), prev1(), splat(),
>
> If the PhD holders who came up with this algorithm thought it possible to do 
> it that way, I'm sure they would have. In reality, simdjson has different 
> files for SSE4, AVX, AVX512, NEON, and Altivec. We can incorporate any of 
> those as needed. That's a PG15 project, though, and I'm not volunteering.

Just for fun/experimentation, here's a quick (and probably too naive)
translation of those helper functions to NEON, on top of the v15
patch.
From 1464c22da33117900341496d03b92dec5be2a62a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 22 Jul 2021 02:05:06 +1200
Subject: [PATCH 1/2] XXX Make SIMD code more platform neutral.

Move SIMD code into pg_utf_simd.c to experiment with the idea of a
shared implementation across architectures.  Introduce pg_u8x16_t to
abstract vector type.

XXX Experiment grade code only
---
 configure                                     |  18 +--
 configure.ac                                  |  18 +--
 src/include/pg_config.h.in                    |  14 +-
 src/include/port/pg_utf8.h                    |  10 +-
 src/port/Makefile                             |   8 +-
 ...g_utf8_sse42_choose.c => pg_utf8_choose.c} |  10 +-
 src/port/{pg_utf8_sse42.c => pg_utf8_simd.c}  | 148 +++++++++---------
 7 files changed, 114 insertions(+), 112 deletions(-)
 rename src/port/{pg_utf8_sse42_choose.c => pg_utf8_choose.c} (88%)
 rename src/port/{pg_utf8_sse42.c => pg_utf8_simd.c} (76%)

diff --git a/configure b/configure
index 30969840b1..df546b641c 100755
--- a/configure
+++ b/configure
@@ -18442,13 +18442,13 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
 # in the template or configure command line.
-if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" 
= x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+if test x"$USE_SIMD_UTF8" = x"" && test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = 
x"" && test x"$USE_FALLBACK_UTF8" = x""; then
   if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = 
x"1" ; then
-    USE_SSE42_UTF8=1
+    USE_SIMD_UTF8=1
   else
     # the CPUID instruction is needed for the runtime check.
     if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" 
= x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+      USE_SIMD_UTF8_WITH_RUNTIME_CHECK=1
     else
       # fall back to algorithm which doesn't require any special
       # CPU support.
@@ -18461,19 +18461,19 @@ fi
 # Note: We need the fallback for error handling in all builds.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking which UTF-8 validator to 
use" >&5
 $as_echo_n "checking which UTF-8 validator to use... " >&6; }
-if test x"$USE_SSE42_UTF8" = x"1"; then
+if test x"$USE_SIMD_UTF8" = x"1"; then
 
-$as_echo "#define USE_SSE42_UTF8 1" >>confdefs.h
+$as_echo "#define USE_SIMD_UTF8 1" >>confdefs.h
 
-  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+  PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o"
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
-  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+  if test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
 
-$as_echo "#define USE_SSE42_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
+$as_echo "#define USE_SIMD_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+    PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o pg_utf8_choose.o"
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime 
check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
   else
diff --git a/configure.ac b/configure.ac
index 5e2b4717c1..1606a80fb7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2217,13 +2217,13 @@ AC_SUBST(PG_CRC32C_OBJS)
 #
 # You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
 # in the template or configure command line.
-if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" 
= x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+if test x"$USE_SIMD_UTF8" = x"" && test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = 
x"" && test x"$USE_FALLBACK_UTF8" = x""; then
   if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = 
x"1" ; then
-    USE_SSE42_UTF8=1
+    USE_SIMD_UTF8=1
   else
     # the CPUID instruction is needed for the runtime check.
     if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" 
= x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+      USE_SIMD_UTF8_WITH_RUNTIME_CHECK=1
     else
       # fall back to algorithm which doesn't require any special
       # CPU support.
@@ -2235,14 +2235,14 @@ fi
 # Set PG_UTF8_OBJS appropriately depending on the selected implementation.
 # Note: We need the fallback for error handling in all builds.
 AC_MSG_CHECKING([which UTF-8 validator to use])
-if test x"$USE_SSE42_UTF8" = x"1"; then
-  AC_DEFINE(USE_SSE42_UTF8, 1, [Define to 1 use Intel SSE 4.2 instructions.])
-  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+if test x"$USE_SIMD_UTF8" = x"1"; then
+  AC_DEFINE(USE_SIMD_UTF8, 1, [Define to 1 to use Intel SSE 4.2 instructions.])
+  PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o"
   AC_MSG_RESULT(SSE 4.2)
 else
-  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
-    AC_DEFINE(USE_SSE42_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel 
SSE 4.2 instructions with a runtime check.])
-    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+  if test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+    AC_DEFINE(USE_SIMD_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel 
SSE 4.2 instructions with a runtime check.])
+    PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o pg_utf8_choose.o"
     AC_MSG_RESULT(SSE 4.2 with runtime check)
   else
     AC_DEFINE(USE_FALLBACK_UTF8, 1, [Define to 1 to use Intel SSE 4.2 
instructions with a runtime check.])
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 9d5e1efda9..f1456553e6 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -904,6 +904,9 @@
 /* Define to 1 to build with BSD Authentication support. (--with-bsd-auth) */
 #undef USE_BSD_AUTH
 
+/* Define to 1 to use the fallback UTF-8 validator written in C. */
+#undef USE_FALLBACK_UTF8
+
 /* Define to build with ICU support. (--with-icu) */
 #undef USE_ICU
 
@@ -932,14 +935,11 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
-/* Define to 1 to use the fallback UTF-8 validator written in C. */
-#undef USE_FALLBACK_UTF8
-
-/* Define to 1 use the UTF-8 validator written with Intel SSE instructions. */
-#undef USE_SSE42_UTF8
+/* Define to 1 to use Intel SSE 4.2 instructions. */
+#undef USE_SIMD_UTF8
 
-/* Define to 1 use the UTF-8 validator written with Intel SSE instructions 
with runtime check. */
-#undef USE_SSE42_UTF8_WITH_RUNTIME_CHECK
+/* Define to 1 to use Intel SSE 4.2 instructions with a runtime check. */
+#undef USE_SIMD_UTF8_WITH_RUNTIME_CHECK
 
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h
index dc38369a31..a9f2b9f15b 100644
--- a/src/include/port/pg_utf8.h
+++ b/src/include/port/pg_utf8.h
@@ -15,14 +15,14 @@
 #define PG_UTF8_H
 
 
-#if defined(USE_SSE42_UTF8)
+#if defined(USE_SIMD_UTF8)
 /* Use Intel SSE4.2 instructions. */
 #define UTF8_VERIFYSTR(s, len) \
-       pg_validate_utf8_sse42((s), (len))
+       pg_validate_utf8_simd((s), (len))
 
-extern int     pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int     pg_validate_utf8_simd(const unsigned char *s, int len);
 
-#elif defined(USE_SSE42_UTF8_WITH_RUNTIME_CHECK)
+#elif defined(USE_SIMD_UTF8_WITH_RUNTIME_CHECK)
 /*
  * Use Intel SSE 4.2 instructions, but perform a runtime check first
  * to check that they are available.
@@ -31,7 +31,7 @@ extern int    pg_validate_utf8_sse42(const unsigned char *s, 
int len);
        pg_validate_utf8((s), (len))
 
 extern int     (*pg_validate_utf8) (const unsigned char *s, int len);
-extern int     pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int     pg_validate_utf8_simd(const unsigned char *s, int len);
 
 #else
 #define UTF8_VERIFYSTR(s, len) \
diff --git a/src/port/Makefile b/src/port/Makefile
index 04838b0ab2..893fcd7d59 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -90,10 +90,10 @@ libpgport.a: $(OBJS)
 thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
 thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS)
 
-# all versions of pg_utf8_sse42.o need CFLAGS_SSE42
-pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
-pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
-pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+# all versions of pg_utf8_simd.o need CFLAGS_SSE42
+pg_utf8_simd.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_simd_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_simd_srv.o: CFLAGS+=$(CFLAGS_SSE42)
 
 # all versions of pg_crc32c_sse42.o need CFLAGS_SSE42
 pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
diff --git a/src/port/pg_utf8_sse42_choose.c b/src/port/pg_utf8_choose.c
similarity index 88%
rename from src/port/pg_utf8_sse42_choose.c
rename to src/port/pg_utf8_choose.c
index ff6120be2b..140c0dce7b 100644
--- a/src/port/pg_utf8_sse42_choose.c
+++ b/src/port/pg_utf8_choose.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
- * pg_utf8_sse42_choose.c
- *       Choose between Intel SSE 4.2 and fallback implementation.
+ * pg_utf8_choose.c
+ *       Choose between SSE 4.2 and fallback implementation.
  *
  * On first call, checks if the CPU we're running on supports Intel SSE
  * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
@@ -30,7 +30,7 @@
 #include "port/pg_utf8.h"
 
 static bool
-pg_utf8_sse42_available(void)
+pg_utf8_simd_available(void)
 {
        /* To save from checking every SSE2 intrinsic, insist on 64-bit. */
 #ifdef __x86_64__
@@ -57,8 +57,8 @@ pg_utf8_sse42_available(void)
 static int
 pg_validate_utf8_choose(const unsigned char *s, int len)
 {
-       if (pg_utf8_sse42_available())
-               pg_validate_utf8 = pg_validate_utf8_sse42;
+       if (pg_utf8_simd_available())
+               pg_validate_utf8 = pg_validate_utf8_simd;
        else
                pg_validate_utf8 = pg_validate_utf8_fallback;
 
diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_simd.c
similarity index 76%
rename from src/port/pg_utf8_sse42.c
rename to src/port/pg_utf8_simd.c
index cd050ec2bf..7ca9060e3a 100644
--- a/src/port/pg_utf8_sse42.c
+++ b/src/port/pg_utf8_simd.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  *
- * pg_utf8_sse42.c
+ * pg_utf8_simd.c
  *       Validate UTF-8 using Intel SSE 4.2 instructions.
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       src/port/pg_utf8_sse42.c
+ *       src/port/pg_utf8_simd.c
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,8 @@
 
 #include "port/pg_utf8.h"
 
+typedef __m128i pg_u8x16_t;
+
 /*
  * This module is based on the paper "Validating UTF-8 In Less Than One
  * Instruction Per Byte" by John Keiser and Daniel Lemire, arXiv:2010.03090
@@ -184,48 +186,48 @@
 #define vset(...)              _mm_setr_epi8(__VA_ARGS__)
 
 /* return a zeroed register */
-static inline const __m128i
+static inline const pg_u8x16_t
 vzero()
 {
        return _mm_setzero_si128();
 }
 
 /* perform an unaligned load from memory into a register */
-static inline const __m128i
+static inline const pg_u8x16_t
 vload(const unsigned char *raw_input)
 {
-       return _mm_loadu_si128((const __m128i *) raw_input);
+       return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
 }
 
 /* return a vector with each 8-bit lane populated with the input scalar */
-static inline __m128i
+static inline pg_u8x16_t
 splat(char byte)
 {
        return _mm_set1_epi8(byte);
 }
 
 /* perform signed greater-than on all 8-bit lanes */
-static inline __m128i
-greater_than(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_cmpgt_epi8(v1, v2);
 }
 
 /* bitwise vector operations */
-static inline __m128i
-bitwise_and(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_and_si128(v1, v2);
 }
 
-static inline __m128i
-bitwise_or(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_or_si128(v1, v2);
 }
 
-static inline __m128i
-bitwise_xor(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_xor_si128(v1, v2);
 }
@@ -235,8 +237,8 @@ bitwise_xor(const __m128i v1, const __m128i v2)
  * on overflow, stop at zero. Useful for emulating unsigned
  * comparison.
  */
-static inline __m128i
-saturating_sub(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_subs_epu8(v1, v2);
 }
@@ -247,11 +249,11 @@ saturating_sub(const __m128i v1, const __m128i v2)
  * There is no intrinsic to do this on 8-bit lanes, so shift right in each
  * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
  */
-static inline __m128i
-shift_right(const __m128i v, const int n)
+static inline pg_u8x16_t
+shift_right(const pg_u8x16_t v, const int n)
 {
-       const           __m128i shift16 = _mm_srli_epi16(v, n);
-       const           __m128i mask = splat(0xFF >> n);
+       const           pg_u8x16_t shift16 = _mm_srli_epi16(v, n);
+       const           pg_u8x16_t mask = splat(0xFF >> n);
 
        return bitwise_and(shift16, mask);
 }
@@ -266,30 +268,30 @@ shift_right(const __m128i v, const int n)
  * The third argument to the intrinsic must be a numeric constant, so
  * we must have separate functions for different shift amounts.
  */
-static inline __m128i
-prev1(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
-       return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 1);
+       return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
 }
 
-static inline __m128i
-prev2(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
-       return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 2);
+       return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
 }
 
-static inline __m128i
-prev3(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
-       return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 3);
+       return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
 }
 
 /*
  * For each 8-bit lane in the input, use that value as an index
  * into the lookup vector as if it were a 16-element byte array.
  */
-static inline __m128i
-lookup(const __m128i input, const __m128i lookup)
+static inline pg_u8x16_t
+lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
        return _mm_shuffle_epi8(lookup, input);
 }
@@ -298,28 +300,28 @@ lookup(const __m128i input, const __m128i lookup)
  * Return a vector with lanes non-zero where we have either errors, or
  * two or more continuations in a row.
  */
-static inline __m128i
-check_special_cases(const __m128i prev, const __m128i input)
+static inline pg_u8x16_t
+check_special_cases(const pg_u8x16_t prev, const pg_u8x16_t input)
 {
-       const           __m128i byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
-       const           __m128i byte_1_low_table = vset(BYTE_1_LOW_TABLE);
-       const           __m128i byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
+       const           pg_u8x16_t byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
+       const           pg_u8x16_t byte_1_low_table = vset(BYTE_1_LOW_TABLE);
+       const           pg_u8x16_t byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
 
        /*
         * To classify the first byte in each chunk we need to have the last 
byte
         * from the previous chunk.
         */
-       const           __m128i input_shift1 = prev1(prev, input);
+       const           pg_u8x16_t input_shift1 = prev1(prev, input);
 
        /* put the relevant nibbles into their own bytes in their own registers 
*/
-       const           __m128i byte_1_high = shift_right(input_shift1, 4);
-       const           __m128i byte_1_low = bitwise_and(input_shift1, 
splat(0x0F));
-       const           __m128i byte_2_high = shift_right(input, 4);
+       const           pg_u8x16_t byte_1_high = shift_right(input_shift1, 4);
+       const           pg_u8x16_t byte_1_low = bitwise_and(input_shift1, 
splat(0x0F));
+       const           pg_u8x16_t byte_2_high = shift_right(input, 4);
 
        /* lookup the possible errors for each set of nibbles */
-       const           __m128i lookup_1_high = lookup(byte_1_high, 
byte_1_high_table);
-       const           __m128i lookup_1_low = lookup(byte_1_low, 
byte_1_low_table);
-       const           __m128i lookup_2_high = lookup(byte_2_high, 
byte_2_high_table);
+       const           pg_u8x16_t lookup_1_high = lookup(byte_1_high, 
byte_1_high_table);
+       const           pg_u8x16_t lookup_1_low = lookup(byte_1_low, 
byte_1_low_table);
+       const           pg_u8x16_t lookup_2_high = lookup(byte_2_high, 
byte_2_high_table);
 
        /*
         * AND all the lookups together. At this point, non-zero lanes in the
@@ -331,7 +333,7 @@ check_special_cases(const __m128i prev, const __m128i input)
         *
         * 3. the third continuation byte of a 4-byte character
         */
-       const           __m128i temp = bitwise_and(lookup_1_high, lookup_1_low);
+       const           pg_u8x16_t temp = bitwise_and(lookup_1_high, 
lookup_1_low);
 
        return bitwise_and(temp, lookup_2_high);
 }
@@ -340,22 +342,22 @@ check_special_cases(const __m128i prev, const __m128i 
input)
  * Return a vector with lanes set to TWO_CONTS where we expect to find two
  * continuations in a row. These are valid only within 3- and 4-byte sequences.
  */
-static inline __m128i
-check_multibyte_lengths(const __m128i prev, const __m128i input)
+static inline pg_u8x16_t
+check_multibyte_lengths(const pg_u8x16_t prev, const pg_u8x16_t input)
 {
        /*
         * Populate registers that contain the input shifted right by 2 and 3
         * bytes, filling in the left lanes from the previous input.
         */
-       const           __m128i input_shift2 = prev2(prev, input);
-       const           __m128i input_shift3 = prev3(prev, input);
+       const           pg_u8x16_t input_shift2 = prev2(prev, input);
+       const           pg_u8x16_t input_shift3 = prev3(prev, input);
 
        /*
         * Constants for comparison. Any 3-byte lead is greater than
         * MAX_TWO_BYTE_LEAD, etc.
         */
-       const           __m128i max_lead2 = splat(MAX_TWO_BYTE_LEAD);
-       const           __m128i max_lead3 = splat(MAX_THREE_BYTE_LEAD);
+       const           pg_u8x16_t max_lead2 = splat(MAX_TWO_BYTE_LEAD);
+       const           pg_u8x16_t max_lead3 = splat(MAX_THREE_BYTE_LEAD);
 
        /*
         * Look in the shifted registers for 3- or 4-byte leads. There is no
@@ -363,17 +365,17 @@ check_multibyte_lengths(const __m128i prev, const __m128i 
input)
         * signed comparison with zero. Any non-zero bytes in the result 
represent
         * valid leads.
         */
-       const           __m128i is_third_byte = saturating_sub(input_shift2, 
max_lead2);
-       const           __m128i is_fourth_byte = saturating_sub(input_shift3, 
max_lead3);
+       const           pg_u8x16_t is_third_byte = saturating_sub(input_shift2, 
max_lead2);
+       const           pg_u8x16_t is_fourth_byte = 
saturating_sub(input_shift3, max_lead3);
 
        /* OR them together for easier comparison */
-       const           __m128i temp = bitwise_or(is_third_byte, 
is_fourth_byte);
+       const           pg_u8x16_t temp = bitwise_or(is_third_byte, 
is_fourth_byte);
 
        /*
         * Set all bits in each 8-bit lane if the result is greater than zero.
         * Signed arithmetic is okay because the values are small.
         */
-       const           __m128i must23 = greater_than(temp, vzero());
+       const           pg_u8x16_t must23 = greater_than(temp, vzero());
 
        /*
         * We want to compare with the result of check_special_cases() so apply 
a
@@ -385,20 +387,20 @@ check_multibyte_lengths(const __m128i prev, const __m128i 
input)
 
 /* set bits in the error vector where we find invalid UTF-8 input */
 static inline void
-check_utf8_bytes(const __m128i prev, const __m128i input, __m128i * error)
+check_utf8_bytes(const pg_u8x16_t prev, const pg_u8x16_t input, pg_u8x16_t * 
error)
 {
-       const           __m128i special_cases = check_special_cases(prev, 
input);
-       const           __m128i expect_two_conts = 
check_multibyte_lengths(prev, input);
+       const           pg_u8x16_t special_cases = check_special_cases(prev, 
input);
+       const           pg_u8x16_t expect_two_conts = 
check_multibyte_lengths(prev, input);
 
        /* If the two cases are identical, this will be zero. */
-       const           __m128i result = bitwise_xor(expect_two_conts, 
special_cases);
+       const           pg_u8x16_t result = bitwise_xor(expect_two_conts, 
special_cases);
 
        *error = bitwise_or(*error, result);
 }
 
 /* return false if a register is zero, true otherwise */
 static inline bool
-to_bool(const __m128i v)
+to_bool(const pg_u8x16_t v)
 {
        /*
         * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
@@ -409,25 +411,25 @@ to_bool(const __m128i v)
 
 /* set bits in the error vector where bytes in the input are zero */
 static inline void
-check_for_zeros(const __m128i v, __m128i * error)
+check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
-       const           __m128i cmp = _mm_cmpeq_epi8(v, vzero());
+       const           pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
 
        *error = bitwise_or(*error, cmp);
 }
 
 /* vector version of IS_HIGHBIT_SET() */
 static inline bool
-is_highbit_set(const __m128i v)
+is_highbit_set(const pg_u8x16_t v)
 {
        return _mm_movemask_epi8(v) != 0;
 }
 
 /* return non-zero if the input terminates with an incomplete code point */
-static inline __m128i
-is_incomplete(const __m128i v)
+static inline pg_u8x16_t
+is_incomplete(const pg_u8x16_t v)
 {
-       const           __m128i max_array =
+       const           pg_u8x16_t max_array =
        vset(0xFF, 0xFF, 0xFF, 0xFF,
                 0xFF, 0xFF, 0xFF, 0xFF,
                 0xFF, 0xFF, 0xFF, 0xFF,
@@ -440,20 +442,20 @@ is_incomplete(const __m128i v)
  * See the comment in common/wchar.c under "multibyte sequence validators".
  */
 int
-pg_validate_utf8_sse42(const unsigned char *s, int len)
+pg_validate_utf8_simd(const unsigned char *s, int len)
 {
        const unsigned char *start = s;
        const int       orig_len = len;
-       __m128i         error = vzero();
-       __m128i         prev = vzero();
-       __m128i         prev_incomplete = vzero();
-       __m128i         input;
+       pg_u8x16_t              error = vzero();
+       pg_u8x16_t              prev = vzero();
+       pg_u8x16_t              prev_incomplete = vzero();
+       pg_u8x16_t              input;
 
        /*
         * NB: This check must be strictly greater-than, otherwise an invalid 
byte
         * at the end might not get detected.
         */
-       while (len > sizeof(__m128i))
+       while (len > sizeof(pg_u8x16_t))
        {
                input = vload(s);
 
@@ -474,8 +476,8 @@ pg_validate_utf8_sse42(const unsigned char *s, int len)
                }
 
                prev = input;
-               s += sizeof(__m128i);
-               len -= sizeof(__m128i);
+               s += sizeof(pg_u8x16_t);
+               len -= sizeof(pg_u8x16_t);
        }
 
        /*
-- 
2.30.2

From ee96ffe76422bdd9e3d56ab2a2d56db3458cbf4d Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 22 Jul 2021 02:05:30 +1200
Subject: [PATCH 2/2] XXX Add ARM/NEON support for UTF-8 validation.

Needs configure checks.
Needs "choose" logic.  Probably a SIGILL test, as done elsewhere?
For now works only if you configure with USE_UTF8_SIMD=1.

XXX Experiment grade code only
---
 src/port/pg_utf8_simd.c | 102 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/port/pg_utf8_simd.c b/src/port/pg_utf8_simd.c
index 7ca9060e3a..f52d9c98d6 100644
--- a/src/port/pg_utf8_simd.c
+++ b/src/port/pg_utf8_simd.c
@@ -15,11 +15,27 @@
 
 #include "c.h"
 
+#if defined(__aarch64__)
+#define USE_NEON
+#elif defined(__x86_64__)
+#define USE_SSE
+#else
+#error "Unsupported architecture"
+#endif
+
+#if defined(USE_NEON)
+#include <arm_neon.h>
+#elif defined(USE_SSE)
 #include <nmmintrin.h>
+#endif
 
 #include "port/pg_utf8.h"
 
+#if defined(USE_NEON)
+typedef uint8x16_t pg_u8x16_t;
+#elif defined(USE_SSE)
 typedef __m128i pg_u8x16_t;
+#endif
 
 /*
  * This module is based on the paper "Validating UTF-8 In Less Than One
@@ -183,53 +199,95 @@ typedef __m128i pg_u8x16_t;
 
 /* helper functions to wrap intrinsics */
 
+#if defined(USE_NEON)
+static pg_attribute_always_inline pg_u8x16_t
+vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
+        uint8 v4, uint8 v5, uint8 v6, uint8 v7,
+        uint8 v8, uint8 v9, uint8 v10, uint8 v11,
+        uint8 v12, uint8 v13, uint8 v14, uint8 v15)
+{
+       uint8 pg_attribute_aligned(16) values[16] = {
+               v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, 
v14, v15
+       };
+       return vld1q_u8(values);
+}
+#elif defined(USE_SSE)
 #define vset(...)              _mm_setr_epi8(__VA_ARGS__)
+#endif
 
 /* return a zeroed register */
 static inline const pg_u8x16_t
 vzero()
 {
+#if defined(USE_NEON)
+       return vmovq_n_u8(0);
+#elif defined(USE_SSE)
        return _mm_setzero_si128();
+#endif
 }
 
 /* perform an unaligned load from memory into a register */
 static inline const pg_u8x16_t
 vload(const unsigned char *raw_input)
 {
+#if defined(USE_NEON)
+       return vld1q_u8(raw_input);
+#elif defined(USE_SSE)
        return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
+#endif
 }
 
 /* return a vector with each 8-bit lane populated with the input scalar */
 static inline pg_u8x16_t
 splat(char byte)
 {
+#if defined(USE_NEON)
+       return vdupq_n_u8((unsigned char) byte);
+#elif defined(USE_SSE)
        return _mm_set1_epi8(byte);
+#endif
 }
 
 /* perform signed greater-than on all 8-bit lanes */
 static inline pg_u8x16_t
 greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
+#elif defined(USE_SSE)
        return _mm_cmpgt_epi8(v1, v2);
+#endif
 }
 
 /* bitwise vector operations */
 static inline pg_u8x16_t
 bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return vandq_u8(v1, v2);
+#elif defined(USE_SSE)
        return _mm_and_si128(v1, v2);
+#endif
 }
 
 static inline pg_u8x16_t
 bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return vorrq_u8(v1, v2);
+#elif defined(USE_SSE)
        return _mm_or_si128(v1, v2);
+#endif
 }
 
 static inline pg_u8x16_t
 bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return veorq_u8(v1, v2);
+#elif defined(USE_SSE)
        return _mm_xor_si128(v1, v2);
+#endif
 }
 
 /*
@@ -240,22 +298,32 @@ bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 static inline pg_u8x16_t
 saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return vqsubq_u8(v1, v2);
+#elif defined(USE_SSE)
        return _mm_subs_epu8(v1, v2);
+#endif
 }
 
 /*
  * Shift right each 8-bit lane
- *
- * There is no intrinsic to do this on 8-bit lanes, so shift right in each
- * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
  */
 static inline pg_u8x16_t
 shift_right(const pg_u8x16_t v, const int n)
 {
+#if defined(USE_NEON)
+       return vshrq_n_u8(v, n);
+#elif defined(USE_SSE)
+       /*
+        * There is no intrinsic to do this on 8-bit lanes, so shift right in 
each
+        * 16-bit lane then apply a mask in each 8-bit lane shifted the same
+        * amount.
+        */
        const           pg_u8x16_t shift16 = _mm_srli_epi16(v, n);
        const           pg_u8x16_t mask = splat(0xFF >> n);
 
        return bitwise_and(shift16, mask);
+#endif
 }
 
 /*
@@ -271,19 +339,31 @@ shift_right(const pg_u8x16_t v, const int n)
 static inline pg_u8x16_t
 prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+       return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 1);
+#elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
+#endif
 }
 
 static inline pg_u8x16_t
 prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+       return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 2);
+#elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
+#endif
 }
 
 static inline pg_u8x16_t
 prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+       return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 3);
+#elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
+#endif
 }
 
 /*
@@ -293,7 +373,11 @@ prev3(pg_u8x16_t prev, pg_u8x16_t input)
 static inline pg_u8x16_t
 lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
+#if defined(USE_NEON)
+       return vqtbl1q_u8(lookup, input);
+#elif defined(USE_SSE)
        return _mm_shuffle_epi8(lookup, input);
+#endif
 }
 
 /*
@@ -402,18 +486,26 @@ check_utf8_bytes(const pg_u8x16_t prev, const pg_u8x16_t 
input, pg_u8x16_t * err
 static inline bool
 to_bool(const pg_u8x16_t v)
 {
+#if defined(USE_NEON)
+       return vmaxvq_u32((uint32x4_t) v) != 0;
+#elif defined(USE_SSE)
        /*
         * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
         * zero. Zero is the only value whose bitwise AND with itself is zero.
         */
        return !_mm_testz_si128(v, v);
+#endif
 }
 
 /* set bits in the error vector where bytes in the input are zero */
 static inline void
 check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
+#if defined(USE_NEON)
+       const           pg_u8x16_t cmp = vceqq_u8(v, vzero());
+#elif defined(USE_SSE)
        const           pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
+#endif
 
        *error = bitwise_or(*error, cmp);
 }
@@ -422,7 +514,11 @@ check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 static inline bool
 is_highbit_set(const pg_u8x16_t v)
 {
+#if defined(USE_NEON)
+       return to_bool(bitwise_and(v, vmovq_n_u8(0x80)));
+#elif defined(USE_SSE)
        return _mm_movemask_epi8(v) != 0;
+#endif
 }
 
 /* return non-zero if the input terminates with an incomplete code point */
-- 
2.30.2

Reply via email to