On Thu, Jul 22, 2021 at 6:16 AM John Naylor
<john.nay...@enterprisedb.com> wrote:
> Neat! It's good to make it more architecture-agnostic, and I'm sure we can 
> use quite a bit of this.

One question is whether this "one size fits all" approach will be
extensible to wider SIMD.

>  to_bool(const pg_u8x16_t v)
>  {
> +#if defined(USE_NEON)
> + return vmaxvq_u32((uint32x4_t) v) != 0;
>
> --> return vmaxvq_u8(*this) != 0;

I chose that lane width because I saw an unsubstantiated claim
somewhere that it might be faster, but I have no idea if it matters.
The u8 code looks more natural anyway.  Changed.

>  vzero()
>  {
> +#if defined(USE_NEON)
> + return vmovq_n_u8(0);
>
> --> return vdupq_n_u8(0); // or equivalently, splat(0)

I guess it doesn't make a difference which builtin you use here, but I
was influenced by the ARM manual which says the vdupq form is
generated for immediate values.

> is_highbit_set(const pg_u8x16_t v)
>  {
> +#if defined(USE_NEON)
> + return to_bool(bitwise_and(v, vmovq_n_u8(0x80)));
>
> --> return vmaxq_u8(v) > 0x7F

Ah, of course.  Much nicer!

> +#if defined(USE_NEON)
> +static pg_attribute_always_inline pg_u8x16_t
> +vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
> + uint8 v4, uint8 v5, uint8 v6, uint8 v7,
> + uint8 v8, uint8 v9, uint8 v10, uint8 v11,
> + uint8 v12, uint8 v13, uint8 v14, uint8 v15)
> +{
> + uint8 pg_attribute_aligned(16) values[16] = {
> + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
> + };
> + return vld1q_u8(values);
> +}
>
> --> They have this strange beast instead:
>
>   // Doing a load like so end ups generating worse code.
>   // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
>   //                     x9, x10,x11,x12,x13,x14,x15,x16};
>   // return vld1q_u8(array);
>   uint8x16_t x{};
>   // incredibly, Visual Studio does not allow x[0] = x1
>   x = vsetq_lane_u8(x1, x, 0);
>   x = vsetq_lane_u8(x2, x, 1);
>   x = vsetq_lane_u8(x3, x, 2);
> ...
>   x = vsetq_lane_u8(x15, x, 14);
>   x = vsetq_lane_u8(x16, x, 15);
>   return x;
>
> Since you aligned the array, that might not have the problem alluded to 
> above, and it looks nicer.

Strange indeed.  We should probably poke around in the assembler and
see... it might be that MSVC doesn't like it, and I was just
cargo-culting the alignment.  I don't expect the generated code to
really "load" anything of course, it should ideally be some kind of
immediate mov...

FWIW here are some performance results from my humble RPI4:

master:

 chinese | mixed | ascii
---------+-------+-------
    4172 |  2763 |  1823
(1 row)

Your v15 patch:

 chinese | mixed | ascii
---------+-------+-------
    2267 |  1248 |   399
(1 row)

Your v15 patch set + the NEON patch, configured with USE_UTF8_SIMD=1:

 chinese | mixed | ascii
---------+-------+-------
     909 |   620 |   318
(1 row)

It's so good I wonder if it's producing incorrect results :-)

I also tried to do a quick and dirty AltiVec patch to see if it could
fit into the same code "shape", with less immediate success: it works
out slower than the fallback code on the POWER7 machine I scrounged an
account on.  I'm not sure what's wrong there, but maybe it's a useful
start (I'm probably confused about endianness, or the encoding of
boolean vectors which may be different (is true 0x01 or 0xff, does it
matter?), or something else, and it's falling back on errors all the
time?).
From 1464c22da33117900341496d03b92dec5be2a62a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 22 Jul 2021 02:05:06 +1200
Subject: [PATCH v2 1/3] XXX Make SIMD code more platform neutral.

Move SIMD code into pg_utf_simd.c to experiment with the idea of a
shared implementation across architectures.  Introduce pg_u8x16_t to
abstract vector type.

XXX Experiment grade code only
---
 configure                                     |  18 +--
 configure.ac                                  |  18 +--
 src/include/pg_config.h.in                    |  14 +-
 src/include/port/pg_utf8.h                    |  10 +-
 src/port/Makefile                             |   8 +-
 ...g_utf8_sse42_choose.c => pg_utf8_choose.c} |  10 +-
 src/port/{pg_utf8_sse42.c => pg_utf8_simd.c}  | 148 +++++++++---------
 7 files changed, 114 insertions(+), 112 deletions(-)
 rename src/port/{pg_utf8_sse42_choose.c => pg_utf8_choose.c} (88%)
 rename src/port/{pg_utf8_sse42.c => pg_utf8_simd.c} (76%)

diff --git a/configure b/configure
index 30969840b1..df546b641c 100755
--- a/configure
+++ b/configure
@@ -18442,13 +18442,13 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
 # in the template or configure command line.
-if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" 
= x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+if test x"$USE_SIMD_UTF8" = x"" && test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = 
x"" && test x"$USE_FALLBACK_UTF8" = x""; then
   if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = 
x"1" ; then
-    USE_SSE42_UTF8=1
+    USE_SIMD_UTF8=1
   else
     # the CPUID instruction is needed for the runtime check.
     if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" 
= x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+      USE_SIMD_UTF8_WITH_RUNTIME_CHECK=1
     else
       # fall back to algorithm which doesn't require any special
       # CPU support.
@@ -18461,19 +18461,19 @@ fi
 # Note: We need the fallback for error handling in all builds.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking which UTF-8 validator to 
use" >&5
 $as_echo_n "checking which UTF-8 validator to use... " >&6; }
-if test x"$USE_SSE42_UTF8" = x"1"; then
+if test x"$USE_SIMD_UTF8" = x"1"; then
 
-$as_echo "#define USE_SSE42_UTF8 1" >>confdefs.h
+$as_echo "#define USE_SIMD_UTF8 1" >>confdefs.h
 
-  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+  PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o"
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
-  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+  if test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
 
-$as_echo "#define USE_SSE42_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
+$as_echo "#define USE_SIMD_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+    PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o pg_utf8_choose.o"
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime 
check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
   else
diff --git a/configure.ac b/configure.ac
index 5e2b4717c1..1606a80fb7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2217,13 +2217,13 @@ AC_SUBST(PG_CRC32C_OBJS)
 #
 # You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
 # in the template or configure command line.
-if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" 
= x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+if test x"$USE_SIMD_UTF8" = x"" && test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = 
x"" && test x"$USE_FALLBACK_UTF8" = x""; then
   if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = 
x"1" ; then
-    USE_SSE42_UTF8=1
+    USE_SIMD_UTF8=1
   else
     # the CPUID instruction is needed for the runtime check.
     if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" 
= x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+      USE_SIMD_UTF8_WITH_RUNTIME_CHECK=1
     else
       # fall back to algorithm which doesn't require any special
       # CPU support.
@@ -2235,14 +2235,14 @@ fi
 # Set PG_UTF8_OBJS appropriately depending on the selected implementation.
 # Note: We need the fallback for error handling in all builds.
 AC_MSG_CHECKING([which UTF-8 validator to use])
-if test x"$USE_SSE42_UTF8" = x"1"; then
-  AC_DEFINE(USE_SSE42_UTF8, 1, [Define to 1 use Intel SSE 4.2 instructions.])
-  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+if test x"$USE_SIMD_UTF8" = x"1"; then
+  AC_DEFINE(USE_SIMD_UTF8, 1, [Define to 1 use Intel SSE 4.2 instructions.])
+  PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o"
   AC_MSG_RESULT(SSE 4.2)
 else
-  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
-    AC_DEFINE(USE_SSE42_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel 
SSE 4.2 instructions with a runtime check.])
-    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+  if test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+    AC_DEFINE(USE_SIMD_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel 
SSE 4.2 instructions with a runtime check.])
+    PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o pg_utf8_choose.o"
     AC_MSG_RESULT(SSE 4.2 with runtime check)
   else
     AC_DEFINE(USE_FALLBACK_UTF8, 1, [Define to 1 to use Intel SSE 4.2 
instructions with a runtime check.])
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 9d5e1efda9..f1456553e6 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -904,6 +904,9 @@
 /* Define to 1 to build with BSD Authentication support. (--with-bsd-auth) */
 #undef USE_BSD_AUTH
 
+/* Define to 1 to use Intel SSE 4.2 instructions with a runtime check. */
+#undef USE_FALLBACK_UTF8
+
 /* Define to build with ICU support. (--with-icu) */
 #undef USE_ICU
 
@@ -932,14 +935,11 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
-/* Define to 1 to use the fallback UTF-8 validator written in C. */
-#undef USE_FALLBACK_UTF8
-
-/* Define to 1 use the UTF-8 validator written with Intel SSE instructions. */
-#undef USE_SSE42_UTF8
+/* Define to 1 use Intel SSE 4.2 instructions. */
+#undef USE_SIMD_UTF8
 
-/* Define to 1 use the UTF-8 validator written with Intel SSE instructions 
with runtime check. */
-#undef USE_SSE42_UTF8_WITH_RUNTIME_CHECK
+/* Define to 1 to use Intel SSE 4.2 instructions with a runtime check. */
+#undef USE_SIMD_UTF8_WITH_RUNTIME_CHECK
 
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h
index dc38369a31..a9f2b9f15b 100644
--- a/src/include/port/pg_utf8.h
+++ b/src/include/port/pg_utf8.h
@@ -15,14 +15,14 @@
 #define PG_UTF8_H
 
 
-#if defined(USE_SSE42_UTF8)
+#if defined(USE_SIMD_UTF8)
 /* Use Intel SSE4.2 instructions. */
 #define UTF8_VERIFYSTR(s, len) \
-       pg_validate_utf8_sse42((s), (len))
+       pg_validate_utf8_simd((s), (len))
 
-extern int     pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int     pg_validate_utf8_simd(const unsigned char *s, int len);
 
-#elif defined(USE_SSE42_UTF8_WITH_RUNTIME_CHECK)
+#elif defined(USE_SIMD_UTF8_WITH_RUNTIME_CHECK)
 /*
  * Use Intel SSE 4.2 instructions, but perform a runtime check first
  * to check that they are available.
@@ -31,7 +31,7 @@ extern int    pg_validate_utf8_sse42(const unsigned char *s, 
int len);
        pg_validate_utf8((s), (len))
 
 extern int     (*pg_validate_utf8) (const unsigned char *s, int len);
-extern int     pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int     pg_validate_utf8_simd(const unsigned char *s, int len);
 
 #else
 #define UTF8_VERIFYSTR(s, len) \
diff --git a/src/port/Makefile b/src/port/Makefile
index 04838b0ab2..893fcd7d59 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -90,10 +90,10 @@ libpgport.a: $(OBJS)
 thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
 thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS)
 
-# all versions of pg_utf8_sse42.o need CFLAGS_SSE42
-pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
-pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
-pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+# all versions of pg_utf8_simd.o need CFLAGS_SSE42
+pg_utf8_simd.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_simd_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_simd_srv.o: CFLAGS+=$(CFLAGS_SSE42)
 
 # all versions of pg_crc32c_sse42.o need CFLAGS_SSE42
 pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
diff --git a/src/port/pg_utf8_sse42_choose.c b/src/port/pg_utf8_choose.c
similarity index 88%
rename from src/port/pg_utf8_sse42_choose.c
rename to src/port/pg_utf8_choose.c
index ff6120be2b..140c0dce7b 100644
--- a/src/port/pg_utf8_sse42_choose.c
+++ b/src/port/pg_utf8_choose.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
- * pg_utf8_sse42_choose.c
- *       Choose between Intel SSE 4.2 and fallback implementation.
+ * pg_utf8_choose.c
+ *       Choose between SSE 4.2 and fallback implementation.
  *
  * On first call, checks if the CPU we're running on supports Intel SSE
  * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
@@ -30,7 +30,7 @@
 #include "port/pg_utf8.h"
 
 static bool
-pg_utf8_sse42_available(void)
+pg_utf8_simd_available(void)
 {
        /* To save from checking every SSE2 intrinsic, insist on 64-bit. */
 #ifdef __x86_64__
@@ -57,8 +57,8 @@ pg_utf8_sse42_available(void)
 static int
 pg_validate_utf8_choose(const unsigned char *s, int len)
 {
-       if (pg_utf8_sse42_available())
-               pg_validate_utf8 = pg_validate_utf8_sse42;
+       if (pg_utf8_simd_available())
+               pg_validate_utf8 = pg_validate_utf8_simd;
        else
                pg_validate_utf8 = pg_validate_utf8_fallback;
 
diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_simd.c
similarity index 76%
rename from src/port/pg_utf8_sse42.c
rename to src/port/pg_utf8_simd.c
index cd050ec2bf..7ca9060e3a 100644
--- a/src/port/pg_utf8_sse42.c
+++ b/src/port/pg_utf8_simd.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  *
- * pg_utf8_sse42.c
+ * pg_utf8_simd.c
  *       Validate UTF-8 using Intel SSE 4.2 instructions.
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       src/port/pg_utf8_sse42.c
+ *       src/port/pg_utf8_simd.c
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,8 @@
 
 #include "port/pg_utf8.h"
 
+typedef __m128i pg_u8x16_t;
+
 /*
  * This module is based on the paper "Validating UTF-8 In Less Than One
  * Instruction Per Byte" by John Keiser and Daniel Lemire, arXiv:2010.03090
@@ -184,48 +186,48 @@
 #define vset(...)              _mm_setr_epi8(__VA_ARGS__)
 
 /* return a zeroed register */
-static inline const __m128i
+static inline const pg_u8x16_t
 vzero()
 {
        return _mm_setzero_si128();
 }
 
 /* perform an unaligned load from memory into a register */
-static inline const __m128i
+static inline const pg_u8x16_t
 vload(const unsigned char *raw_input)
 {
-       return _mm_loadu_si128((const __m128i *) raw_input);
+       return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
 }
 
 /* return a vector with each 8-bit lane populated with the input scalar */
-static inline __m128i
+static inline pg_u8x16_t
 splat(char byte)
 {
        return _mm_set1_epi8(byte);
 }
 
 /* perform signed greater-than on all 8-bit lanes */
-static inline __m128i
-greater_than(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_cmpgt_epi8(v1, v2);
 }
 
 /* bitwise vector operations */
-static inline __m128i
-bitwise_and(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_and_si128(v1, v2);
 }
 
-static inline __m128i
-bitwise_or(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_or_si128(v1, v2);
 }
 
-static inline __m128i
-bitwise_xor(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_xor_si128(v1, v2);
 }
@@ -235,8 +237,8 @@ bitwise_xor(const __m128i v1, const __m128i v2)
  * on overflow, stop at zero. Useful for emulating unsigned
  * comparison.
  */
-static inline __m128i
-saturating_sub(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
        return _mm_subs_epu8(v1, v2);
 }
@@ -247,11 +249,11 @@ saturating_sub(const __m128i v1, const __m128i v2)
  * There is no intrinsic to do this on 8-bit lanes, so shift right in each
  * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
  */
-static inline __m128i
-shift_right(const __m128i v, const int n)
+static inline pg_u8x16_t
+shift_right(const pg_u8x16_t v, const int n)
 {
-       const           __m128i shift16 = _mm_srli_epi16(v, n);
-       const           __m128i mask = splat(0xFF >> n);
+       const           pg_u8x16_t shift16 = _mm_srli_epi16(v, n);
+       const           pg_u8x16_t mask = splat(0xFF >> n);
 
        return bitwise_and(shift16, mask);
 }
@@ -266,30 +268,30 @@ shift_right(const __m128i v, const int n)
  * The third argument to the intrinsic must be a numeric constant, so
  * we must have separate functions for different shift amounts.
  */
-static inline __m128i
-prev1(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
-       return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 1);
+       return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
 }
 
-static inline __m128i
-prev2(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
-       return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 2);
+       return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
 }
 
-static inline __m128i
-prev3(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
-       return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 3);
+       return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
 }
 
 /*
  * For each 8-bit lane in the input, use that value as an index
  * into the lookup vector as if it were a 16-element byte array.
  */
-static inline __m128i
-lookup(const __m128i input, const __m128i lookup)
+static inline pg_u8x16_t
+lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
        return _mm_shuffle_epi8(lookup, input);
 }
@@ -298,28 +300,28 @@ lookup(const __m128i input, const __m128i lookup)
  * Return a vector with lanes non-zero where we have either errors, or
  * two or more continuations in a row.
  */
-static inline __m128i
-check_special_cases(const __m128i prev, const __m128i input)
+static inline pg_u8x16_t
+check_special_cases(const pg_u8x16_t prev, const pg_u8x16_t input)
 {
-       const           __m128i byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
-       const           __m128i byte_1_low_table = vset(BYTE_1_LOW_TABLE);
-       const           __m128i byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
+       const           pg_u8x16_t byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
+       const           pg_u8x16_t byte_1_low_table = vset(BYTE_1_LOW_TABLE);
+       const           pg_u8x16_t byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
 
        /*
         * To classify the first byte in each chunk we need to have the last 
byte
         * from the previous chunk.
         */
-       const           __m128i input_shift1 = prev1(prev, input);
+       const           pg_u8x16_t input_shift1 = prev1(prev, input);
 
        /* put the relevant nibbles into their own bytes in their own registers 
*/
-       const           __m128i byte_1_high = shift_right(input_shift1, 4);
-       const           __m128i byte_1_low = bitwise_and(input_shift1, 
splat(0x0F));
-       const           __m128i byte_2_high = shift_right(input, 4);
+       const           pg_u8x16_t byte_1_high = shift_right(input_shift1, 4);
+       const           pg_u8x16_t byte_1_low = bitwise_and(input_shift1, 
splat(0x0F));
+       const           pg_u8x16_t byte_2_high = shift_right(input, 4);
 
        /* lookup the possible errors for each set of nibbles */
-       const           __m128i lookup_1_high = lookup(byte_1_high, 
byte_1_high_table);
-       const           __m128i lookup_1_low = lookup(byte_1_low, 
byte_1_low_table);
-       const           __m128i lookup_2_high = lookup(byte_2_high, 
byte_2_high_table);
+       const           pg_u8x16_t lookup_1_high = lookup(byte_1_high, 
byte_1_high_table);
+       const           pg_u8x16_t lookup_1_low = lookup(byte_1_low, 
byte_1_low_table);
+       const           pg_u8x16_t lookup_2_high = lookup(byte_2_high, 
byte_2_high_table);
 
        /*
         * AND all the lookups together. At this point, non-zero lanes in the
@@ -331,7 +333,7 @@ check_special_cases(const __m128i prev, const __m128i input)
         *
         * 3. the third continuation byte of a 4-byte character
         */
-       const           __m128i temp = bitwise_and(lookup_1_high, lookup_1_low);
+       const           pg_u8x16_t temp = bitwise_and(lookup_1_high, 
lookup_1_low);
 
        return bitwise_and(temp, lookup_2_high);
 }
@@ -340,22 +342,22 @@ check_special_cases(const __m128i prev, const __m128i 
input)
  * Return a vector with lanes set to TWO_CONTS where we expect to find two
  * continuations in a row. These are valid only within 3- and 4-byte sequences.
  */
-static inline __m128i
-check_multibyte_lengths(const __m128i prev, const __m128i input)
+static inline pg_u8x16_t
+check_multibyte_lengths(const pg_u8x16_t prev, const pg_u8x16_t input)
 {
        /*
         * Populate registers that contain the input shifted right by 2 and 3
         * bytes, filling in the left lanes from the previous input.
         */
-       const           __m128i input_shift2 = prev2(prev, input);
-       const           __m128i input_shift3 = prev3(prev, input);
+       const           pg_u8x16_t input_shift2 = prev2(prev, input);
+       const           pg_u8x16_t input_shift3 = prev3(prev, input);
 
        /*
         * Constants for comparison. Any 3-byte lead is greater than
         * MAX_TWO_BYTE_LEAD, etc.
         */
-       const           __m128i max_lead2 = splat(MAX_TWO_BYTE_LEAD);
-       const           __m128i max_lead3 = splat(MAX_THREE_BYTE_LEAD);
+       const           pg_u8x16_t max_lead2 = splat(MAX_TWO_BYTE_LEAD);
+       const           pg_u8x16_t max_lead3 = splat(MAX_THREE_BYTE_LEAD);
 
        /*
         * Look in the shifted registers for 3- or 4-byte leads. There is no
@@ -363,17 +365,17 @@ check_multibyte_lengths(const __m128i prev, const __m128i 
input)
         * signed comparison with zero. Any non-zero bytes in the result 
represent
         * valid leads.
         */
-       const           __m128i is_third_byte = saturating_sub(input_shift2, 
max_lead2);
-       const           __m128i is_fourth_byte = saturating_sub(input_shift3, 
max_lead3);
+       const           pg_u8x16_t is_third_byte = saturating_sub(input_shift2, 
max_lead2);
+       const           pg_u8x16_t is_fourth_byte = 
saturating_sub(input_shift3, max_lead3);
 
        /* OR them together for easier comparison */
-       const           __m128i temp = bitwise_or(is_third_byte, 
is_fourth_byte);
+       const           pg_u8x16_t temp = bitwise_or(is_third_byte, 
is_fourth_byte);
 
        /*
         * Set all bits in each 8-bit lane if the result is greater than zero.
         * Signed arithmetic is okay because the values are small.
         */
-       const           __m128i must23 = greater_than(temp, vzero());
+       const           pg_u8x16_t must23 = greater_than(temp, vzero());
 
        /*
         * We want to compare with the result of check_special_cases() so apply 
a
@@ -385,20 +387,20 @@ check_multibyte_lengths(const __m128i prev, const __m128i 
input)
 
 /* set bits in the error vector where we find invalid UTF-8 input */
 static inline void
-check_utf8_bytes(const __m128i prev, const __m128i input, __m128i * error)
+check_utf8_bytes(const pg_u8x16_t prev, const pg_u8x16_t input, pg_u8x16_t * 
error)
 {
-       const           __m128i special_cases = check_special_cases(prev, 
input);
-       const           __m128i expect_two_conts = 
check_multibyte_lengths(prev, input);
+       const           pg_u8x16_t special_cases = check_special_cases(prev, 
input);
+       const           pg_u8x16_t expect_two_conts = 
check_multibyte_lengths(prev, input);
 
        /* If the two cases are identical, this will be zero. */
-       const           __m128i result = bitwise_xor(expect_two_conts, 
special_cases);
+       const           pg_u8x16_t result = bitwise_xor(expect_two_conts, 
special_cases);
 
        *error = bitwise_or(*error, result);
 }
 
 /* return false if a register is zero, true otherwise */
 static inline bool
-to_bool(const __m128i v)
+to_bool(const pg_u8x16_t v)
 {
        /*
         * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
@@ -409,25 +411,25 @@ to_bool(const __m128i v)
 
 /* set bits in the error vector where bytes in the input are zero */
 static inline void
-check_for_zeros(const __m128i v, __m128i * error)
+check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
-       const           __m128i cmp = _mm_cmpeq_epi8(v, vzero());
+       const           pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
 
        *error = bitwise_or(*error, cmp);
 }
 
 /* vector version of IS_HIGHBIT_SET() */
 static inline bool
-is_highbit_set(const __m128i v)
+is_highbit_set(const pg_u8x16_t v)
 {
        return _mm_movemask_epi8(v) != 0;
 }
 
 /* return non-zero if the input terminates with an incomplete code point */
-static inline __m128i
-is_incomplete(const __m128i v)
+static inline pg_u8x16_t
+is_incomplete(const pg_u8x16_t v)
 {
-       const           __m128i max_array =
+       const           pg_u8x16_t max_array =
        vset(0xFF, 0xFF, 0xFF, 0xFF,
                 0xFF, 0xFF, 0xFF, 0xFF,
                 0xFF, 0xFF, 0xFF, 0xFF,
@@ -440,20 +442,20 @@ is_incomplete(const __m128i v)
  * See the comment in common/wchar.c under "multibyte sequence validators".
  */
 int
-pg_validate_utf8_sse42(const unsigned char *s, int len)
+pg_validate_utf8_simd(const unsigned char *s, int len)
 {
        const unsigned char *start = s;
        const int       orig_len = len;
-       __m128i         error = vzero();
-       __m128i         prev = vzero();
-       __m128i         prev_incomplete = vzero();
-       __m128i         input;
+       pg_u8x16_t              error = vzero();
+       pg_u8x16_t              prev = vzero();
+       pg_u8x16_t              prev_incomplete = vzero();
+       pg_u8x16_t              input;
 
        /*
         * NB: This check must be strictly greater-than, otherwise an invalid 
byte
         * at the end might not get detected.
         */
-       while (len > sizeof(__m128i))
+       while (len > sizeof(pg_u8x16_t))
        {
                input = vload(s);
 
@@ -474,8 +476,8 @@ pg_validate_utf8_sse42(const unsigned char *s, int len)
                }
 
                prev = input;
-               s += sizeof(__m128i);
-               len -= sizeof(__m128i);
+               s += sizeof(pg_u8x16_t);
+               len -= sizeof(pg_u8x16_t);
        }
 
        /*
-- 
2.30.2

From 8f2b0f0dfe70699695f9a3a64b4c737201791da0 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 22 Jul 2021 02:05:30 +1200
Subject: [PATCH v2 2/3] XXX Add ARM/NEON support for UTF-8 validation.

Needs configure checks.
Needs "choose" logic.  Probably a SIGILL test, as done elsewhere?
For now works only if you configure with USE_UTF8_SIMD=1.

XXX Experiment grade code only
---
 src/port/pg_utf8_simd.c | 102 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/port/pg_utf8_simd.c b/src/port/pg_utf8_simd.c
index 7ca9060e3a..db0fe61b93 100644
--- a/src/port/pg_utf8_simd.c
+++ b/src/port/pg_utf8_simd.c
@@ -15,11 +15,27 @@
 
 #include "c.h"
 
+#if defined(__aarch64__)
+#define USE_NEON
+#elif defined(__x86_64__)
+#define USE_SSE
+#else
+#error "Unsupported architecture"
+#endif
+
+#if defined(USE_NEON)
+#include <arm_neon.h>
+#elif defined(USE_SSE)
 #include <nmmintrin.h>
+#endif
 
 #include "port/pg_utf8.h"
 
+#if defined(USE_NEON)
+typedef uint8x16_t pg_u8x16_t;
+#elif defined(USE_SSE)
 typedef __m128i pg_u8x16_t;
+#endif
 
 /*
  * This module is based on the paper "Validating UTF-8 In Less Than One
@@ -183,53 +199,95 @@ typedef __m128i pg_u8x16_t;
 
 /* helper functions to wrap intrinsics */
 
+#if defined(USE_NEON)
+static pg_attribute_always_inline pg_u8x16_t
+vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
+        uint8 v4, uint8 v5, uint8 v6, uint8 v7,
+        uint8 v8, uint8 v9, uint8 v10, uint8 v11,
+        uint8 v12, uint8 v13, uint8 v14, uint8 v15)
+{
+       uint8 pg_attribute_aligned(16) values[16] = {
+               v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, 
v14, v15
+       };
+       return vld1q_u8(values);
+}
+#elif defined(USE_SSE)
 #define vset(...)              _mm_setr_epi8(__VA_ARGS__)
+#endif
 
 /* return a zeroed register */
 static inline const pg_u8x16_t
 vzero()
 {
+#if defined(USE_NEON)
+       return vmovq_n_u8(0);
+#elif defined(USE_SSE)
        return _mm_setzero_si128();
+#endif
 }
 
 /* perform an unaligned load from memory into a register */
 static inline const pg_u8x16_t
 vload(const unsigned char *raw_input)
 {
+#if defined(USE_NEON)
+       return vld1q_u8(raw_input);
+#elif defined(USE_SSE)
        return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
+#endif
 }
 
 /* return a vector with each 8-bit lane populated with the input scalar */
 static inline pg_u8x16_t
 splat(char byte)
 {
+#if defined(USE_NEON)
+       return vdupq_n_u8((unsigned char) byte);
+#elif defined(USE_SSE)
        return _mm_set1_epi8(byte);
+#endif
 }
 
 /* perform signed greater-than on all 8-bit lanes */
 static inline pg_u8x16_t
 greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
+#elif defined(USE_SSE)
        return _mm_cmpgt_epi8(v1, v2);
+#endif
 }
 
 /* bitwise vector operations */
 static inline pg_u8x16_t
 bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return vandq_u8(v1, v2);
+#elif defined(USE_SSE)
        return _mm_and_si128(v1, v2);
+#endif
 }
 
 static inline pg_u8x16_t
 bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return vorrq_u8(v1, v2);
+#elif defined(USE_SSE)
        return _mm_or_si128(v1, v2);
+#endif
 }
 
 static inline pg_u8x16_t
 bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return veorq_u8(v1, v2);
+#elif defined(USE_SSE)
        return _mm_xor_si128(v1, v2);
+#endif
 }
 
 /*
@@ -240,22 +298,32 @@ bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 static inline pg_u8x16_t
 saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+       return vqsubq_u8(v1, v2);
+#elif defined(USE_SSE)
        return _mm_subs_epu8(v1, v2);
+#endif
 }
 
 /*
  * Shift right each 8-bit lane
- *
- * There is no intrinsic to do this on 8-bit lanes, so shift right in each
- * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
  */
 static inline pg_u8x16_t
 shift_right(const pg_u8x16_t v, const int n)
 {
+#if defined(USE_NEON)
+       return vshrq_n_u8(v, n);
+#elif defined(USE_SSE)
+       /*
+        * There is no intrinsic to do this on 8-bit lanes, so shift right in 
each
+        * 16-bit lane then apply a mask in each 8-bit lane shifted the same
+        * amount.
+        */
        const           pg_u8x16_t shift16 = _mm_srli_epi16(v, n);
        const           pg_u8x16_t mask = splat(0xFF >> n);
 
        return bitwise_and(shift16, mask);
+#endif
 }
 
 /*
@@ -271,19 +339,31 @@ shift_right(const pg_u8x16_t v, const int n)
 static inline pg_u8x16_t
 prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+       return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 1);
+#elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
+#endif
 }
 
 static inline pg_u8x16_t
 prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+       return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 2);
+#elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
+#endif
 }
 
 static inline pg_u8x16_t
 prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+       return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 3);
+#elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
+#endif
 }
 
 /*
@@ -293,7 +373,11 @@ prev3(pg_u8x16_t prev, pg_u8x16_t input)
 static inline pg_u8x16_t
 lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
+#if defined(USE_NEON)
+       return vqtbl1q_u8(lookup, input);
+#elif defined(USE_SSE)
        return _mm_shuffle_epi8(lookup, input);
+#endif
 }
 
 /*
@@ -402,18 +486,26 @@ check_utf8_bytes(const pg_u8x16_t prev, const pg_u8x16_t input, pg_u8x16_t * err
 static inline bool
 to_bool(const pg_u8x16_t v)
 {
+#if defined(USE_NEON)
+       return vmaxvq_u8(v) != 0;
+#elif defined(USE_SSE)
        /*
         * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
         * zero. Zero is the only value whose bitwise AND with itself is zero.
         */
        return !_mm_testz_si128(v, v);
+#endif
 }
 
 /* set bits in the error vector where bytes in the input are zero */
 static inline void
 check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
+#if defined(USE_NEON)
+       const           pg_u8x16_t cmp = vceqq_u8(v, vzero());
+#elif defined(USE_SSE)
        const           pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
+#endif
 
        *error = bitwise_or(*error, cmp);
 }
@@ -422,7 +514,11 @@ check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 static inline bool
 is_highbit_set(const pg_u8x16_t v)
 {
+#if defined(USE_NEON)
+       return vmaxvq_u8(v) > 0x7F;
+#elif defined(USE_SSE)
        return _mm_movemask_epi8(v) != 0;
+#endif
 }
 
 /* return non-zero if the input terminates with an incomplete code point */
-- 
2.30.2

From 5e43a7af0fae5d8e75ad3c92237ff2935b97217f Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Wed, 21 Jul 2021 23:28:01 +0000
Subject: [PATCH v2 3/3] XXX Add POWER AltiVec support for UTF-8 validation.

XXX This isn't right yet
---
 src/port/pg_utf8_simd.c | 61 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/src/port/pg_utf8_simd.c b/src/port/pg_utf8_simd.c
index db0fe61b93..58978db213 100644
--- a/src/port/pg_utf8_simd.c
+++ b/src/port/pg_utf8_simd.c
@@ -19,6 +19,8 @@
 #define USE_NEON
 #elif defined(__x86_64__)
 #define USE_SSE
+#elif defined(__powerpc__)
+#define USE_ALTIVEC
 #else
 #error "Unsupported architecture"
 #endif
@@ -27,6 +29,8 @@
 #include <arm_neon.h>
 #elif defined(USE_SSE)
 #include <nmmintrin.h>
+#elif defined(USE_ALTIVEC)
+#include <altivec.h>
 #endif
 
 #include "port/pg_utf8.h"
@@ -35,6 +39,8 @@
 typedef uint8x16_t pg_u8x16_t;
 #elif defined(USE_SSE)
 typedef __m128i pg_u8x16_t;
+#elif defined(USE_ALTIVEC)
+typedef vector unsigned char pg_u8x16_t;
 #endif
 
 /*
@@ -211,6 +217,18 @@ vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
        };
        return vld1q_u8(values);
 }
+#elif defined(USE_ALTIVEC)
+static pg_attribute_always_inline pg_u8x16_t
+vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
+        uint8 v4, uint8 v5, uint8 v6, uint8 v7,
+        uint8 v8, uint8 v9, uint8 v10, uint8 v11,
+        uint8 v12, uint8 v13, uint8 v14, uint8 v15)
+{
+       pg_u8x16_t v = {
+               v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+       };
+       return v;
+}
 #elif defined(USE_SSE)
 #define vset(...)              _mm_setr_epi8(__VA_ARGS__)
 #endif
@@ -221,6 +239,8 @@ vzero()
 {
 #if defined(USE_NEON)
        return vmovq_n_u8(0);
+#elif defined(USE_ALTIVEC)
+       return vset(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 #elif defined(USE_SSE)
        return _mm_setzero_si128();
 #endif
@@ -232,6 +252,8 @@ vload(const unsigned char *raw_input)
 {
 #if defined(USE_NEON)
        return vld1q_u8(raw_input);
+#elif defined(USE_ALTIVEC)
+       return vec_ld(0, raw_input);
 #elif defined(USE_SSE)
        return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
 #endif
@@ -243,6 +265,8 @@ splat(char byte)
 {
 #if defined(USE_NEON)
        return vdupq_n_u8((unsigned char) byte);
+#elif defined(USE_ALTIVEC)
+       return vec_splats((unsigned char) byte);
 #elif defined(USE_SSE)
        return _mm_set1_epi8(byte);
 #endif
@@ -254,6 +278,8 @@ greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
        return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
+#elif defined(USE_ALTIVEC)
+       return (pg_u8x16_t) vec_cmpgt((vector signed char) v1, (vector signed char) v2);
 #elif defined(USE_SSE)
        return _mm_cmpgt_epi8(v1, v2);
 #endif
@@ -265,6 +291,8 @@ bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
        return vandq_u8(v1, v2);
+#elif defined(USE_ALTIVEC)
+       return vec_and(v1, v2);
 #elif defined(USE_SSE)
        return _mm_and_si128(v1, v2);
 #endif
@@ -275,6 +303,8 @@ bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
        return vorrq_u8(v1, v2);
+#elif defined(USE_ALTIVEC)
+       return vec_or(v1, v2);
 #elif defined(USE_SSE)
        return _mm_or_si128(v1, v2);
 #endif
@@ -285,6 +315,8 @@ bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
        return veorq_u8(v1, v2);
+#elif defined(USE_ALTIVEC)
+       return vec_xor(v1, v2);
 #elif defined(USE_SSE)
        return _mm_xor_si128(v1, v2);
 #endif
@@ -300,6 +332,8 @@ saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
        return vqsubq_u8(v1, v2);
+#elif defined(USE_ALTIVEC)
+       return vec_subs(v1, v2);
 #elif defined(USE_SSE)
        return _mm_subs_epu8(v1, v2);
 #endif
@@ -313,6 +347,9 @@ shift_right(const pg_u8x16_t v, const int n)
 {
 #if defined(USE_NEON)
        return vshrq_n_u8(v, n);
+#elif defined(USE_ALTIVEC)
+       /* XXX is there a shift right with a single value for n? */
+       return vec_sr(v, splat(n));
 #elif defined(USE_SSE)
        /*
        * There is no intrinsic to do this on 8-bit lanes, so shift right in each
@@ -326,6 +363,16 @@ shift_right(const pg_u8x16_t v, const int n)
 #endif
 }
 
+/*
+ * For little endian machines, the prevN functions need to do byte swapping
+ * here.  Is there a way to avoid this?
+ */
+#ifdef WORDS_BIGENDIAN
+#define rev(a) (a)
+#else
+#define rev(a) ((pg_u8x16_t) (vec_reve((_v16qu) (a))))
+#endif
+
 /*
  * Shift entire 'input' register right by N 8-bit lanes, and
  * replace the first N lanes with the last N lanes from the
@@ -341,6 +388,8 @@ prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
 #if defined(USE_NEON)
        return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 1);
+#elif defined(USE_ALTIVEC)
+       return rev(vec_sld(rev(prev), rev(input), sizeof(pg_u8x16_t) - 1));
 #elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
 #endif
@@ -351,6 +400,8 @@ prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
 #if defined(USE_NEON)
        return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 2);
+#elif defined(USE_ALTIVEC)
+       return rev(vec_sld(rev(prev), rev(input), sizeof(pg_u8x16_t) - 2));
 #elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
 #endif
@@ -361,6 +412,8 @@ prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
 #if defined(USE_NEON)
        return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 3);
+#elif defined(USE_ALTIVEC)
+       return rev(vec_sld(rev(prev), rev(input), sizeof(pg_u8x16_t) - 3));
 #elif defined(USE_SSE)
        return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
 #endif
@@ -375,6 +428,8 @@ lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
 #if defined(USE_NEON)
        return vqtbl1q_u8(lookup, input);
+#elif defined(USE_ALTIVEC)
+       return vec_perm(lookup, vzero(), input);
 #elif defined(USE_SSE)
        return _mm_shuffle_epi8(lookup, input);
 #endif
@@ -488,6 +543,8 @@ to_bool(const pg_u8x16_t v)
 {
 #if defined(USE_NEON)
        return vmaxvq_u8(v) != 0;
+#elif defined(USE_ALTIVEC)
+       return !vec_all_eq(v, vzero());
 #elif defined(USE_SSE)
        /*
         * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
@@ -503,6 +560,8 @@ check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
 #if defined(USE_NEON)
        const           pg_u8x16_t cmp = vceqq_u8(v, vzero());
+#elif defined(USE_ALTIVEC)
+       const           pg_u8x16_t cmp = (pg_u8x16_t) vec_cmpeq(v, vzero());
 #elif defined(USE_SSE)
        const           pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
 #endif
@@ -516,6 +575,8 @@ is_highbit_set(const pg_u8x16_t v)
 {
 #if defined(USE_NEON)
        return vmaxvq_u8(v) > 0x7F;
+#elif defined(USE_ALTIVEC)
+       return to_bool(vec_and(v, splat(0x80)));
 #elif defined(USE_SSE)
        return _mm_movemask_epi8(v) != 0;
 #endif
-- 
2.30.2

Reply via email to