On Thu, Jul 22, 2021 at 6:16 AM John Naylor
<john.nay...@enterprisedb.com> wrote:
> Neat! It's good to make it more architecture-agnostic, and I'm sure we can
> use quite a bit of this.

One question is whether this "one size fits all" approach will be
extensible to wider SIMD.

> to_bool(const pg_u8x16_t v)
> {
> +#if defined(USE_NEON)
> +    return vmaxvq_u32((uint32x4_t) v) != 0;
>
> --> return vmaxvq_u8(*this) != 0;

I chose that lane width because I saw an unsubstantiated claim
somewhere that it might be faster, but I have no idea if it actually
matters.  The u8 code looks more natural anyway.  Changed.

> vzero()
> {
> +#if defined(USE_NEON)
> +    return vmovq_n_u8(0);
>
> --> return vdupq_n_u8(0); // or equivalently, splat(0)

I guess it doesn't make a difference which builtin you use here, but
I was influenced by the ARM manual, which says the vdupq form is
generated for immediate values.

> is_highbit_set(const pg_u8x16_t v)
> {
> +#if defined(USE_NEON)
> +    return to_bool(bitwise_and(v, vmovq_n_u8(0x80)));
>
> --> return vmaxq_u8(v) > 0x7F

Ah, of course.  Much nicer!

> +#if defined(USE_NEON)
> +static pg_attribute_always_inline pg_u8x16_t
> +vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
> +     uint8 v4, uint8 v5, uint8 v6, uint8 v7,
> +     uint8 v8, uint8 v9, uint8 v10, uint8 v11,
> +     uint8 v12, uint8 v13, uint8 v14, uint8 v15)
> +{
> +    uint8       pg_attribute_aligned(16) values[16] = {
> +        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
> +    };
> +    return vld1q_u8(values);
> +}
>
> --> They have this strange beast instead:
>
> // Doing a load like so end ups generating worse code.
> // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
> //                      x9, x10,x11,x12,x13,x14,x15,x16};
> // return vld1q_u8(array);
> uint8x16_t x{};
> // incredibly, Visual Studio does not allow x[0] = x1
> x = vsetq_lane_u8(x1, x, 0);
> x = vsetq_lane_u8(x2, x, 1);
> x = vsetq_lane_u8(x3, x, 2);
> ...
> x = vsetq_lane_u8(x15, x, 14);
> x = vsetq_lane_u8(x16, x, 15);
> return x;
>
> Since you aligned the array, that might not have the problem alluded to
> above, and it looks nicer.

Strange indeed.  We should probably poke around in the assembler and
see...  It might be that only MSVC doesn't like the array form, and I
was just cargo-culting the alignment.  I don't expect the generated
code to really "load" anything, of course; ideally it would be some
kind of immediate mov...

FWIW, here are some performance results from my humble RPi4:

master:

 chinese | mixed | ascii
---------+-------+-------
    4172 |  2763 |  1823
(1 row)

Your v15 patch:

 chinese | mixed | ascii
---------+-------+-------
    2267 |  1248 |   399
(1 row)

Your v15 patch set + the NEON patch, configured with USE_UTF8_SIMD=1:

 chinese | mixed | ascii
---------+-------+-------
     909 |   620 |   318
(1 row)

It's so good I wonder if it's producing incorrect results :-)

I also tried to do a quick-and-dirty AltiVec patch to see if it could
fit into the same code "shape", with less immediate success: it works
out slower than the fallback code on the POWER7 machine I scrounged an
account on.  I'm not sure what's wrong there, but maybe it's a useful
start.  (I'm probably confused about endianness, or about the encoding
of boolean vectors, which may be different -- is true 0x01 or 0xFF,
and does it matter? -- or something else, and it's falling back on
errors all the time?)
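PS: Here are two throwaway test programs I have in mind for chasing
those last two open questions.  Both are untested sketches, so treat
the build details as assumptions rather than working code.

First, for the "too good to be true" worry, a brute-force cross-check
of the SIMD validator against the fallback on random buffers; it
assumes the two functions can be linked straight out of the patched
libpgport:

#include <stdio.h>
#include <stdlib.h>

/* both exported by the patched libpgport */
extern int pg_validate_utf8_simd(const unsigned char *s, int len);
extern int pg_validate_utf8_fallback(const unsigned char *s, int len);

int
main(void)
{
    unsigned char buf[256];

    for (int i = 0; i < 10000000; i++)
    {
        int len = rand() % (int) sizeof(buf);

        for (int j = 0; j < len; j++)
            buf[j] = rand() & 0xFF;
        /* both validators must agree on the number of valid bytes */
        if (pg_validate_utf8_simd(buf, len) !=
            pg_validate_utf8_fallback(buf, len))
        {
            printf("mismatch at iteration %d\n", i);
            return 1;
        }
    }
    printf("no mismatches\n");
    return 0;
}

Second, for the AltiVec boolean-encoding and byte-order questions, a
tiny probe (assuming GCC with -maltivec on that POWER box) that prints
what vec_cmpgt actually puts in a "true" lane, and which way vec_sld
moves bytes on the machine at hand:

#include <altivec.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    vector unsigned char a = vec_splats((unsigned char) 2);
    vector unsigned char b = vec_splats((unsigned char) 1);
    vector unsigned char t = (vector unsigned char) vec_cmpgt(a, b);
    vector unsigned char idx = {0, 1, 2, 3, 4, 5, 6, 7,
                                8, 9, 10, 11, 12, 13, 14, 15};
    vector unsigned char sld = vec_sld(idx, idx, 1);
    unsigned char out[16];

    /* is "true" 0x01 or 0xFF? */
    memcpy(out, &t, sizeof(out));
    printf("true lane = 0x%02X\n", out[0]);

    /* which direction, in lane order, does vec_sld shift here? */
    memcpy(out, &sld, sizeof(out));
    for (int i = 0; i < 16; i++)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}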
From 1464c22da33117900341496d03b92dec5be2a62a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 22 Jul 2021 02:05:06 +1200
Subject: [PATCH v2 1/3] XXX Make SIMD code more platform neutral.

Move SIMD code into pg_utf_simd.c to experiment with the idea of a
shared implementation across architectures.  Introduce pg_u8x16_t to
abstract vector type.

XXX Experiment grade code only
---
 configure                                     |  18 +--
 configure.ac                                  |  18 +--
 src/include/pg_config.h.in                    |  14 +-
 src/include/port/pg_utf8.h                    |  10 +-
 src/port/Makefile                             |   8 +-
 ...g_utf8_sse42_choose.c => pg_utf8_choose.c} |  10 +-
 src/port/{pg_utf8_sse42.c => pg_utf8_simd.c}  | 148 +++++++++---------
 7 files changed, 114 insertions(+), 112 deletions(-)
 rename src/port/{pg_utf8_sse42_choose.c => pg_utf8_choose.c} (88%)
 rename src/port/{pg_utf8_sse42.c => pg_utf8_simd.c} (76%)

diff --git a/configure b/configure
index 30969840b1..df546b641c 100755
--- a/configure
+++ b/configure
@@ -18442,13 +18442,13 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
 # in the template or configure command line.
-if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+if test x"$USE_SIMD_UTF8" = x"" && test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
   if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
-    USE_SSE42_UTF8=1
+    USE_SIMD_UTF8=1
   else
     # the CPUID instruction is needed for the runtime check.
     if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+      USE_SIMD_UTF8_WITH_RUNTIME_CHECK=1
     else
       # fall back to algorithm which doesn't require any special
       # CPU support.
@@ -18461,19 +18461,19 @@ fi
 # Note: We need the fallback for error handling in all builds.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking which UTF-8 validator to use" >&5
 $as_echo_n "checking which UTF-8 validator to use... " >&6; }
-if test x"$USE_SSE42_UTF8" = x"1"; then
+if test x"$USE_SIMD_UTF8" = x"1"; then
 
-$as_echo "#define USE_SSE42_UTF8 1" >>confdefs.h
+$as_echo "#define USE_SIMD_UTF8 1" >>confdefs.h
 
-  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+  PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o"
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
-  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+  if test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
 
-$as_echo "#define USE_SSE42_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
+$as_echo "#define USE_SIMD_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+    PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o pg_utf8_choose.o"
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
   else
diff --git a/configure.ac b/configure.ac
index 5e2b4717c1..1606a80fb7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2217,13 +2217,13 @@ AC_SUBST(PG_CRC32C_OBJS)
 #
 # You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
 # in the template or configure command line.
-if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+if test x"$USE_SIMD_UTF8" = x"" && test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
   if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
-    USE_SSE42_UTF8=1
+    USE_SIMD_UTF8=1
   else
     # the CPUID instruction is needed for the runtime check.
     if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+      USE_SIMD_UTF8_WITH_RUNTIME_CHECK=1
     else
       # fall back to algorithm which doesn't require any special
       # CPU support.
@@ -2235,14 +2235,14 @@ fi
 # Set PG_UTF8_OBJS appropriately depending on the selected implementation.
 # Note: We need the fallback for error handling in all builds.
 AC_MSG_CHECKING([which UTF-8 validator to use])
-if test x"$USE_SSE42_UTF8" = x"1"; then
-  AC_DEFINE(USE_SSE42_UTF8, 1, [Define to 1 use Intel SSE 4.2 instructions.])
-  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+if test x"$USE_SIMD_UTF8" = x"1"; then
+  AC_DEFINE(USE_SIMD_UTF8, 1, [Define to 1 use Intel SSE 4.2 instructions.])
+  PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o"
   AC_MSG_RESULT(SSE 4.2)
 else
-  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
-    AC_DEFINE(USE_SSE42_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 instructions with a runtime check.])
-    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+  if test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+    AC_DEFINE(USE_SIMD_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 instructions with a runtime check.])
+    PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o pg_utf8_choose.o"
     AC_MSG_RESULT(SSE 4.2 with runtime check)
   else
     AC_DEFINE(USE_FALLBACK_UTF8, 1, [Define to 1 to use Intel SSE 4.2 instructions with a runtime check.])
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 9d5e1efda9..f1456553e6 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -904,6 +904,9 @@
 /* Define to 1 to build with BSD Authentication support. (--with-bsd-auth) */
 #undef USE_BSD_AUTH
 
+/* Define to 1 to use Intel SSE 4.2 instructions with a runtime check. */
+#undef USE_FALLBACK_UTF8
+
 /* Define to build with ICU support. (--with-icu) */
 #undef USE_ICU
 
@@ -932,14 +935,11 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
-/* Define to 1 to use the fallback UTF-8 validator written in C. */
-#undef USE_FALLBACK_UTF8
-
-/* Define to 1 use the UTF-8 validator written with Intel SSE instructions. */
-#undef USE_SSE42_UTF8
+/* Define to 1 use Intel SSE 4.2 instructions. */
+#undef USE_SIMD_UTF8
 
-/* Define to 1 use the UTF-8 validator written with Intel SSE instructions with runtime check. */
-#undef USE_SSE42_UTF8_WITH_RUNTIME_CHECK
+/* Define to 1 to use Intel SSE 4.2 instructions with a runtime check. */
+#undef USE_SIMD_UTF8_WITH_RUNTIME_CHECK
 
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h
index dc38369a31..a9f2b9f15b 100644
--- a/src/include/port/pg_utf8.h
+++ b/src/include/port/pg_utf8.h
@@ -15,14 +15,14 @@
 #define PG_UTF8_H
 
 
-#if defined(USE_SSE42_UTF8)
+#if defined(USE_SIMD_UTF8)
 /* Use Intel SSE4.2 instructions. */
 #define UTF8_VERIFYSTR(s, len) \
-    pg_validate_utf8_sse42((s), (len))
+    pg_validate_utf8_simd((s), (len))
 
-extern int pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int pg_validate_utf8_simd(const unsigned char *s, int len);
 
-#elif defined(USE_SSE42_UTF8_WITH_RUNTIME_CHECK)
+#elif defined(USE_SIMD_UTF8_WITH_RUNTIME_CHECK)
 /*
  * Use Intel SSE 4.2 instructions, but perform a runtime check first
  * to check that they are available.
@@ -31,7 +31,7 @@ extern int pg_validate_utf8_sse42(const unsigned char *s, int len);
     pg_validate_utf8((s), (len))
 
 extern int (*pg_validate_utf8) (const unsigned char *s, int len);
-extern int pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int pg_validate_utf8_simd(const unsigned char *s, int len);
 
 #else
 #define UTF8_VERIFYSTR(s, len) \
diff --git a/src/port/Makefile b/src/port/Makefile
index 04838b0ab2..893fcd7d59 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -90,10 +90,10 @@ libpgport.a: $(OBJS)
 thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
 thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS)
 
-# all versions of pg_utf8_sse42.o need CFLAGS_SSE42
-pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
-pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
-pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+# all versions of pg_utf8_simd.o need CFLAGS_SSE42
+pg_utf8_simd.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_simd_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_simd_srv.o: CFLAGS+=$(CFLAGS_SSE42)
 
 # all versions of pg_crc32c_sse42.o need CFLAGS_SSE42
 pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
diff --git a/src/port/pg_utf8_sse42_choose.c b/src/port/pg_utf8_choose.c
similarity index 88%
rename from src/port/pg_utf8_sse42_choose.c
rename to src/port/pg_utf8_choose.c
index ff6120be2b..140c0dce7b 100644
--- a/src/port/pg_utf8_sse42_choose.c
+++ b/src/port/pg_utf8_choose.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
- * pg_utf8_sse42_choose.c
- *    Choose between Intel SSE 4.2 and fallback implementation.
+ * pg_utf8_choose.c
+ *    Choose between SSE 4.2 and fallback implementation.
  *
  * On first call, checks if the CPU we're running on supports Intel SSE
  * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
@@ -30,7 +30,7 @@
 #include "port/pg_utf8.h"
 
 static bool
-pg_utf8_sse42_available(void)
+pg_utf8_simd_available(void)
 {
     /* To save from checking every SSE2 intrinsic, insist on 64-bit. */
 #ifdef __x86_64__
@@ -57,8 +57,8 @@ pg_utf8_sse42_available(void)
 static int
 pg_validate_utf8_choose(const unsigned char *s, int len)
 {
-    if (pg_utf8_sse42_available())
-        pg_validate_utf8 = pg_validate_utf8_sse42;
+    if (pg_utf8_simd_available())
+        pg_validate_utf8 = pg_validate_utf8_simd;
     else
         pg_validate_utf8 = pg_validate_utf8_fallback;
 
diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_simd.c
similarity index 76%
rename from src/port/pg_utf8_sse42.c
rename to src/port/pg_utf8_simd.c
index cd050ec2bf..7ca9060e3a 100644
--- a/src/port/pg_utf8_sse42.c
+++ b/src/port/pg_utf8_simd.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  *
- * pg_utf8_sse42.c
+ * pg_utf8_simd.c
  *    Validate UTF-8 using Intel SSE 4.2 instructions.
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    src/port/pg_utf8_sse42.c
+ *    src/port/pg_utf8_simd.c
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,8 @@
 
 #include "port/pg_utf8.h"
 
+typedef __m128i pg_u8x16_t;
+
 /*
  * This module is based on the paper "Validating UTF-8 In Less Than One
  * Instruction Per Byte" by John Keiser and Daniel Lemire, arXiv:2010.03090
@@ -184,48 +186,48 @@
 #define vset(...) _mm_setr_epi8(__VA_ARGS__)
 
 /* return a zeroed register */
-static inline const __m128i
+static inline const pg_u8x16_t
 vzero()
 {
     return _mm_setzero_si128();
 }
 
 /* perform an unaligned load from memory into a register */
-static inline const __m128i
+static inline const pg_u8x16_t
 vload(const unsigned char *raw_input)
 {
-    return _mm_loadu_si128((const __m128i *) raw_input);
+    return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
 }
 
 /* return a vector with each 8-bit lane populated with the input scalar */
-static inline __m128i
+static inline pg_u8x16_t
 splat(char byte)
 {
     return _mm_set1_epi8(byte);
 }
 
 /* perform signed greater-than on all 8-bit lanes */
-static inline __m128i
-greater_than(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
     return _mm_cmpgt_epi8(v1, v2);
 }
 
 /* bitwise vector operations */
-static inline __m128i
-bitwise_and(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
     return _mm_and_si128(v1, v2);
 }
 
-static inline __m128i
-bitwise_or(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
     return _mm_or_si128(v1, v2);
 }
 
-static inline __m128i
-bitwise_xor(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
     return _mm_xor_si128(v1, v2);
 }
@@ -235,8 +237,8 @@ bitwise_xor(const __m128i v1, const __m128i v2)
  * on overflow, stop at zero. Useful for emulating unsigned
  * comparison.
  */
-static inline __m128i
-saturating_sub(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
     return _mm_subs_epu8(v1, v2);
 }
@@ -247,11 +249,11 @@ saturating_sub(const __m128i v1, const __m128i v2)
  * There is no intrinsic to do this on 8-bit lanes, so shift right in each
  * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
  */
-static inline __m128i
-shift_right(const __m128i v, const int n)
+static inline pg_u8x16_t
+shift_right(const pg_u8x16_t v, const int n)
 {
-    const __m128i shift16 = _mm_srli_epi16(v, n);
-    const __m128i mask = splat(0xFF >> n);
+    const pg_u8x16_t shift16 = _mm_srli_epi16(v, n);
+    const pg_u8x16_t mask = splat(0xFF >> n);
 
     return bitwise_and(shift16, mask);
 }
@@ -266,30 +268,30 @@ shift_right(const __m128i v, const int n)
  * The third argument to the intrinsic must be a numeric constant, so
  * we must have separate functions for different shift amounts.
  */
-static inline __m128i
-prev1(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
-    return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 1);
+    return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
 }
 
-static inline __m128i
-prev2(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
-    return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 2);
+    return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
 }
 
-static inline __m128i
-prev3(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
-    return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 3);
+    return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
 }
 
 /*
  * For each 8-bit lane in the input, use that value as an index
  * into the lookup vector as if it were a 16-element byte array.
  */
-static inline __m128i
-lookup(const __m128i input, const __m128i lookup)
+static inline pg_u8x16_t
+lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
     return _mm_shuffle_epi8(lookup, input);
 }
@@ -298,28 +300,28 @@ lookup(const __m128i input, const __m128i lookup)
  * Return a vector with lanes non-zero where we have either errors, or
  * two or more continuations in a row.
  */
-static inline __m128i
-check_special_cases(const __m128i prev, const __m128i input)
+static inline pg_u8x16_t
+check_special_cases(const pg_u8x16_t prev, const pg_u8x16_t input)
 {
-    const __m128i byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
-    const __m128i byte_1_low_table = vset(BYTE_1_LOW_TABLE);
-    const __m128i byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
+    const pg_u8x16_t byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
+    const pg_u8x16_t byte_1_low_table = vset(BYTE_1_LOW_TABLE);
+    const pg_u8x16_t byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
 
     /*
     * To classify the first byte in each chunk we need to have the last byte
     * from the previous chunk.
     */
-    const __m128i input_shift1 = prev1(prev, input);
+    const pg_u8x16_t input_shift1 = prev1(prev, input);
 
     /* put the relevant nibbles into their own bytes in their own registers */
-    const __m128i byte_1_high = shift_right(input_shift1, 4);
-    const __m128i byte_1_low = bitwise_and(input_shift1, splat(0x0F));
-    const __m128i byte_2_high = shift_right(input, 4);
+    const pg_u8x16_t byte_1_high = shift_right(input_shift1, 4);
+    const pg_u8x16_t byte_1_low = bitwise_and(input_shift1, splat(0x0F));
+    const pg_u8x16_t byte_2_high = shift_right(input, 4);
 
     /* lookup the possible errors for each set of nibbles */
-    const __m128i lookup_1_high = lookup(byte_1_high, byte_1_high_table);
-    const __m128i lookup_1_low = lookup(byte_1_low, byte_1_low_table);
-    const __m128i lookup_2_high = lookup(byte_2_high, byte_2_high_table);
+    const pg_u8x16_t lookup_1_high = lookup(byte_1_high, byte_1_high_table);
+    const pg_u8x16_t lookup_1_low = lookup(byte_1_low, byte_1_low_table);
+    const pg_u8x16_t lookup_2_high = lookup(byte_2_high, byte_2_high_table);
 
     /*
     * AND all the lookups together. At this point, non-zero lanes in the
    *
    * 3. the third continuation byte of a 4-byte character
    */
-    const __m128i temp = bitwise_and(lookup_1_high, lookup_1_low);
+    const pg_u8x16_t temp = bitwise_and(lookup_1_high, lookup_1_low);
 
     return bitwise_and(temp, lookup_2_high);
 }
@@ -340,22 +342,22 @@ check_special_cases(const __m128i prev, const __m128i input)
  * Return a vector with lanes set to TWO_CONTS where we expect to find two
  * continuations in a row. These are valid only within 3- and 4-byte sequences.
  */
-static inline __m128i
-check_multibyte_lengths(const __m128i prev, const __m128i input)
+static inline pg_u8x16_t
+check_multibyte_lengths(const pg_u8x16_t prev, const pg_u8x16_t input)
 {
    /*
    * Populate registers that contain the input shifted right by 2 and 3
    * bytes, filling in the left lanes from the previous input.
    */
-    const __m128i input_shift2 = prev2(prev, input);
-    const __m128i input_shift3 = prev3(prev, input);
+    const pg_u8x16_t input_shift2 = prev2(prev, input);
+    const pg_u8x16_t input_shift3 = prev3(prev, input);
 
    /*
    * Constants for comparison. Any 3-byte lead is greater than
    * MAX_TWO_BYTE_LEAD, etc.
    */
-    const __m128i max_lead2 = splat(MAX_TWO_BYTE_LEAD);
-    const __m128i max_lead3 = splat(MAX_THREE_BYTE_LEAD);
+    const pg_u8x16_t max_lead2 = splat(MAX_TWO_BYTE_LEAD);
+    const pg_u8x16_t max_lead3 = splat(MAX_THREE_BYTE_LEAD);
 
    /*
    * Look in the shifted registers for 3- or 4-byte leads. There is no
@@ -363,17 +365,17 @@ check_multibyte_lengths(const __m128i prev, const __m128i input)
    * signed comparison with zero. Any non-zero bytes in the result represent
    * valid leads.
    */
-    const __m128i is_third_byte = saturating_sub(input_shift2, max_lead2);
-    const __m128i is_fourth_byte = saturating_sub(input_shift3, max_lead3);
+    const pg_u8x16_t is_third_byte = saturating_sub(input_shift2, max_lead2);
+    const pg_u8x16_t is_fourth_byte = saturating_sub(input_shift3, max_lead3);
 
    /* OR them together for easier comparison */
-    const __m128i temp = bitwise_or(is_third_byte, is_fourth_byte);
+    const pg_u8x16_t temp = bitwise_or(is_third_byte, is_fourth_byte);
 
    /*
    * Set all bits in each 8-bit lane if the result is greater than zero.
    * Signed arithmetic is okay because the values are small.
    */
-    const __m128i must23 = greater_than(temp, vzero());
+    const pg_u8x16_t must23 = greater_than(temp, vzero());
 
    /*
    * We want to compare with the result of check_special_cases() so apply a
@@ -385,20 +387,20 @@ check_multibyte_lengths(const __m128i prev, const __m128i input)
 
 /* set bits in the error vector where we find invalid UTF-8 input */
 static inline void
-check_utf8_bytes(const __m128i prev, const __m128i input, __m128i * error)
+check_utf8_bytes(const pg_u8x16_t prev, const pg_u8x16_t input, pg_u8x16_t * error)
 {
-    const __m128i special_cases = check_special_cases(prev, input);
-    const __m128i expect_two_conts = check_multibyte_lengths(prev, input);
+    const pg_u8x16_t special_cases = check_special_cases(prev, input);
+    const pg_u8x16_t expect_two_conts = check_multibyte_lengths(prev, input);
 
    /* If the two cases are identical, this will be zero. */
-    const __m128i result = bitwise_xor(expect_two_conts, special_cases);
+    const pg_u8x16_t result = bitwise_xor(expect_two_conts, special_cases);
 
     *error = bitwise_or(*error, result);
 }
 
 /* return false if a register is zero, true otherwise */
 static inline bool
-to_bool(const __m128i v)
+to_bool(const pg_u8x16_t v)
 {
    /*
    * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
    * zero. Zero is the only value whose bitwise AND with itself is zero.
    */
@@ -409,25 +411,25 @@ to_bool(const __m128i v)
 
 /* set bits in the error vector where bytes in the input are zero */
 static inline void
-check_for_zeros(const __m128i v, __m128i * error)
+check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
-    const __m128i cmp = _mm_cmpeq_epi8(v, vzero());
+    const pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
 
     *error = bitwise_or(*error, cmp);
 }
 
 /* vector version of IS_HIGHBIT_SET() */
 static inline bool
-is_highbit_set(const __m128i v)
+is_highbit_set(const pg_u8x16_t v)
 {
     return _mm_movemask_epi8(v) != 0;
 }
 
 /* return non-zero if the input terminates with an incomplete code point */
-static inline __m128i
-is_incomplete(const __m128i v)
+static inline pg_u8x16_t
+is_incomplete(const pg_u8x16_t v)
 {
-    const __m128i max_array =
+    const pg_u8x16_t max_array =
     vset(0xFF, 0xFF, 0xFF, 0xFF,
          0xFF, 0xFF, 0xFF, 0xFF,
         0xFF, 0xFF, 0xFF, 0xFF,
@@ -440,20 +442,20 @@ is_incomplete(const __m128i v)
  * See the comment in common/wchar.c under "multibyte sequence validators".
  */
 int
-pg_validate_utf8_sse42(const unsigned char *s, int len)
+pg_validate_utf8_simd(const unsigned char *s, int len)
 {
     const unsigned char *start = s;
     const int orig_len = len;
-    __m128i     error = vzero();
-    __m128i     prev = vzero();
-    __m128i     prev_incomplete = vzero();
-    __m128i     input;
+    pg_u8x16_t  error = vzero();
+    pg_u8x16_t  prev = vzero();
+    pg_u8x16_t  prev_incomplete = vzero();
+    pg_u8x16_t  input;
 
    /*
    * NB: This check must be strictly greater-than, otherwise an invalid byte
    * at the end might not get detected.
    */
-    while (len > sizeof(__m128i))
+    while (len > sizeof(pg_u8x16_t))
     {
         input = vload(s);
 
@@ -474,8 +476,8 @@ pg_validate_utf8_sse42(const unsigned char *s, int len)
         }
 
         prev = input;
-        s += sizeof(__m128i);
-        len -= sizeof(__m128i);
+        s += sizeof(pg_u8x16_t);
+        len -= sizeof(pg_u8x16_t);
     }
 
    /*
-- 
2.30.2
From 8f2b0f0dfe70699695f9a3a64b4c737201791da0 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 22 Jul 2021 02:05:30 +1200
Subject: [PATCH v2 2/3] XXX Add ARM/NEON support for UTF-8 validation.

Needs configure checks.  Needs "choose" logic.  Probably a SIGILL
test, as done elsewhere?  For now works only if you configure with
USE_UTF8_SIMD=1.

XXX Experiment grade code only
---
 src/port/pg_utf8_simd.c | 102 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/port/pg_utf8_simd.c b/src/port/pg_utf8_simd.c
index 7ca9060e3a..db0fe61b93 100644
--- a/src/port/pg_utf8_simd.c
+++ b/src/port/pg_utf8_simd.c
@@ -15,11 +15,27 @@
 
 #include "c.h"
 
+#if defined(__aarch64__)
+#define USE_NEON
+#elif defined(__x86_64__)
+#define USE_SSE
+#else
+#error "Unsupported architecture"
+#endif
+
+#if defined(USE_NEON)
+#include <arm_neon.h>
+#elif defined(USE_SSE)
 #include <nmmintrin.h>
+#endif
 
 #include "port/pg_utf8.h"
 
+#if defined(USE_NEON)
+typedef uint8x16_t pg_u8x16_t;
+#elif defined(USE_SSE)
 typedef __m128i pg_u8x16_t;
+#endif
 
 /*
  * This module is based on the paper "Validating UTF-8 In Less Than One
@@ -183,53 +199,95 @@ typedef __m128i pg_u8x16_t;
 
 /* helper functions to wrap intrinsics */
 
+#if defined(USE_NEON)
+static pg_attribute_always_inline pg_u8x16_t
+vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
+     uint8 v4, uint8 v5, uint8 v6, uint8 v7,
+     uint8 v8, uint8 v9, uint8 v10, uint8 v11,
+     uint8 v12, uint8 v13, uint8 v14, uint8 v15)
+{
+    uint8       pg_attribute_aligned(16) values[16] = {
+        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+    };
+    return vld1q_u8(values);
+}
+#elif defined(USE_SSE)
 #define vset(...) _mm_setr_epi8(__VA_ARGS__)
+#endif
 
 /* return a zeroed register */
 static inline const pg_u8x16_t
 vzero()
 {
+#if defined(USE_NEON)
+    return vmovq_n_u8(0);
+#elif defined(USE_SSE)
     return _mm_setzero_si128();
+#endif
 }
 
 /* perform an unaligned load from memory into a register */
 static inline const pg_u8x16_t
 vload(const unsigned char *raw_input)
 {
+#if defined(USE_NEON)
+    return vld1q_u8(raw_input);
+#elif defined(USE_SSE)
     return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
+#endif
 }
 
 /* return a vector with each 8-bit lane populated with the input scalar */
 static inline pg_u8x16_t
 splat(char byte)
 {
+#if defined(USE_NEON)
+    return vdupq_n_u8((unsigned char) byte);
+#elif defined(USE_SSE)
     return _mm_set1_epi8(byte);
+#endif
 }
 
 /* perform signed greater-than on all 8-bit lanes */
 static inline pg_u8x16_t
 greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+    return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
+#elif defined(USE_SSE)
     return _mm_cmpgt_epi8(v1, v2);
+#endif
 }
 
 /* bitwise vector operations */
 static inline pg_u8x16_t
 bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+    return vandq_u8(v1, v2);
+#elif defined(USE_SSE)
     return _mm_and_si128(v1, v2);
+#endif
 }
 
 static inline pg_u8x16_t
 bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+    return vorrq_u8(v1, v2);
+#elif defined(USE_SSE)
     return _mm_or_si128(v1, v2);
+#endif
 }
 
 static inline pg_u8x16_t
 bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+    return veorq_u8(v1, v2);
+#elif defined(USE_SSE)
     return _mm_xor_si128(v1, v2);
+#endif
 }
 
 /*
@@ -240,22 +298,32 @@ bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 static inline pg_u8x16_t
 saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
+#if defined(USE_NEON)
+    return vqsubq_u8(v1, v2);
+#elif defined(USE_SSE)
     return _mm_subs_epu8(v1, v2);
+#endif
 }
 
 /*
  * Shift right each 8-bit lane
- *
- * There is no intrinsic to do this on 8-bit lanes, so shift right in each
- * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
  */
 static inline pg_u8x16_t
 shift_right(const pg_u8x16_t v, const int n)
 {
+#if defined(USE_NEON)
+    return vshrq_n_u8(v, n);
+#elif defined(USE_SSE)
+    /*
+     * There is no intrinsic to do this on 8-bit lanes, so shift right in each
+     * 16-bit lane then apply a mask in each 8-bit lane shifted the same
+     * amount.
+     */
     const pg_u8x16_t shift16 = _mm_srli_epi16(v, n);
     const pg_u8x16_t mask = splat(0xFF >> n);
 
     return bitwise_and(shift16, mask);
+#endif
 }
 
 /*
@@ -271,19 +339,31 @@ shift_right(const pg_u8x16_t v, const int n)
 static inline pg_u8x16_t
 prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+    return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 1);
+#elif defined(USE_SSE)
     return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
+#endif
 }
 
 static inline pg_u8x16_t
 prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+    return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 2);
+#elif defined(USE_SSE)
     return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
+#endif
 }
 
 static inline pg_u8x16_t
 prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
+#if defined(USE_NEON)
+    return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 3);
+#elif defined(USE_SSE)
     return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
+#endif
 }
 
 /*
@@ -293,7 +373,11 @@ prev3(pg_u8x16_t prev, pg_u8x16_t input)
 static inline pg_u8x16_t
 lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
+#if defined(USE_NEON)
+    return vqtbl1q_u8(lookup, input);
+#elif defined(USE_SSE)
     return _mm_shuffle_epi8(lookup, input);
+#endif
 }
 
 /*
@@ -402,18 +486,26 @@ check_utf8_bytes(const pg_u8x16_t prev, const pg_u8x16_t input, pg_u8x16_t * err
 static inline bool
 to_bool(const pg_u8x16_t v)
 {
+#if defined(USE_NEON)
+    return vmaxvq_u8(v) != 0;
+#elif defined(USE_SSE)
     /*
     * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
    * zero. Zero is the only value whose bitwise AND with itself is zero.
    */
     return !_mm_testz_si128(v, v);
+#endif
 }
 
 /* set bits in the error vector where bytes in the input are zero */
 static inline void
 check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
+#if defined(USE_NEON)
    const pg_u8x16_t cmp = vceqq_u8(v, vzero());
+#elif defined(USE_SSE)
    const pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
+#endif
 
    *error = bitwise_or(*error, cmp);
 }
@@ -422,7 +514,11 @@ check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 static inline bool
 is_highbit_set(const pg_u8x16_t v)
 {
+#if defined(USE_NEON)
+    return vmaxvq_u8(v) > 0x7F;
+#elif defined(USE_SSE)
     return _mm_movemask_epi8(v) != 0;
+#endif
 }
 
 /* return non-zero if the input terminates with an incomplete code point */
-- 
2.30.2
From 5e43a7af0fae5d8e75ad3c92237ff2935b97217f Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Wed, 21 Jul 2021 23:28:01 +0000
Subject: [PATCH v2 3/3] XXX Add POWER AltiVec support for UTF-8 validation.

XXX This isn't right yet
---
 src/port/pg_utf8_simd.c | 61 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/src/port/pg_utf8_simd.c b/src/port/pg_utf8_simd.c
index db0fe61b93..58978db213 100644
--- a/src/port/pg_utf8_simd.c
+++ b/src/port/pg_utf8_simd.c
@@ -19,6 +19,8 @@
 #define USE_NEON
 #elif defined(__x86_64__)
 #define USE_SSE
+#elif defined(__powerpc__)
+#define USE_ALTIVEC
 #else
 #error "Unsupported architecture"
 #endif
@@ -27,6 +29,8 @@
 #include <arm_neon.h>
 #elif defined(USE_SSE)
 #include <nmmintrin.h>
+#elif defined(USE_ALTIVEC)
+#include <altivec.h>
 #endif
 
 #include "port/pg_utf8.h"
@@ -35,6 +39,8 @@
 typedef uint8x16_t pg_u8x16_t;
 #elif defined(USE_SSE)
 typedef __m128i pg_u8x16_t;
+#elif defined(USE_ALTIVEC)
+typedef vector unsigned char pg_u8x16_t;
 #endif
 
 /*
@@ -211,6 +217,18 @@ vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
     };
     return vld1q_u8(values);
 }
+#elif defined(USE_ALTIVEC)
+static pg_attribute_always_inline pg_u8x16_t
+vset(uint8 v0, uint8 v1, uint8 v2, uint8 v3,
+     uint8 v4, uint8 v5, uint8 v6, uint8 v7,
+     uint8 v8, uint8 v9, uint8 v10, uint8 v11,
+     uint8 v12, uint8 v13, uint8 v14, uint8 v15)
+{
+    pg_u8x16_t  v = {
+        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+    };
+    return v;
+}
 #elif defined(USE_SSE)
 #define vset(...) _mm_setr_epi8(__VA_ARGS__)
 #endif
@@ -221,6 +239,8 @@ vzero()
 {
 #if defined(USE_NEON)
     return vmovq_n_u8(0);
+#elif defined(USE_ALTIVEC)
+    return vset(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 #elif defined(USE_SSE)
     return _mm_setzero_si128();
 #endif
@@ -232,6 +252,8 @@ vload(const unsigned char *raw_input)
 {
 #if defined(USE_NEON)
     return vld1q_u8(raw_input);
+#elif defined(USE_ALTIVEC)
+    return vec_ld(0, raw_input);
 #elif defined(USE_SSE)
     return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
 #endif
@@ -243,6 +265,8 @@ splat(char byte)
 {
 #if defined(USE_NEON)
     return vdupq_n_u8((unsigned char) byte);
+#elif defined(USE_ALTIVEC)
+    return vec_splats((unsigned char) byte);
 #elif defined(USE_SSE)
     return _mm_set1_epi8(byte);
 #endif
@@ -254,6 +278,8 @@ greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
     return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
+#elif defined(USE_ALTIVEC)
+    return (pg_u8x16_t) vec_cmpgt((vector signed char) v1, (vector signed char) v2);
 #elif defined(USE_SSE)
     return _mm_cmpgt_epi8(v1, v2);
 #endif
@@ -265,6 +291,8 @@ bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
     return vandq_u8(v1, v2);
+#elif defined(USE_ALTIVEC)
+    return vec_and(v1, v2);
 #elif defined(USE_SSE)
     return _mm_and_si128(v1, v2);
 #endif
@@ -275,6 +303,8 @@ bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
     return vorrq_u8(v1, v2);
+#elif defined(USE_ALTIVEC)
+    return vec_or(v1, v2);
 #elif defined(USE_SSE)
     return _mm_or_si128(v1, v2);
 #endif
@@ -285,6 +315,8 @@ bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
     return veorq_u8(v1, v2);
+#elif defined(USE_ALTIVEC)
+    return vec_xor(v1, v2);
 #elif defined(USE_SSE)
     return _mm_xor_si128(v1, v2);
 #endif
@@ -300,6 +332,8 @@ saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 #if defined(USE_NEON)
     return vqsubq_u8(v1, v2);
+#elif defined(USE_ALTIVEC)
+    return vec_subs(v1, v2);
 #elif defined(USE_SSE)
     return _mm_subs_epu8(v1, v2);
 #endif
@@ -313,6 +347,9 @@ shift_right(const pg_u8x16_t v, const int n)
 {
 #if defined(USE_NEON)
     return vshrq_n_u8(v, n);
+#elif defined(USE_ALTIVEC)
+    /* XXX is there a shift right with a single value for n? */
+    return vec_sr(v, splat(n));
 #elif defined(USE_SSE)
     /*
      * There is no intrinsic to do this on 8-bit lanes, so shift right in each
@@ -326,6 +363,16 @@ shift_right(const pg_u8x16_t v, const int n)
 #endif
 }
 
+/*
+ * For little endian machines, the prevN functions need to do byte swapping
+ * here.  Is there a way to avoid this?
+ */
+#ifdef WORDS_BIGENDIAN
+#define rev(a) (a)
+#else
+#define rev(a) ((pg_u8x16_t) (vec_reve((_v16qu) (a))))
+#endif
+
 /*
  * Shift entire 'input' register right by N 8-bit lanes, and
  * replace the first N lanes with the last N lanes from the
@@ -341,6 +388,8 @@ prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
 #if defined(USE_NEON)
     return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 1);
+#elif defined(USE_ALTIVEC)
+    return rev(vec_sld(rev(prev), rev(input), sizeof(pg_u8x16_t) - 1));
 #elif defined(USE_SSE)
     return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
 #endif
@@ -351,6 +400,8 @@ prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
 #if defined(USE_NEON)
     return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 2);
+#elif defined(USE_ALTIVEC)
+    return rev(vec_sld(rev(prev), rev(input), sizeof(pg_u8x16_t) - 2));
 #elif defined(USE_SSE)
     return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
 #endif
@@ -361,6 +412,8 @@ prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
 #if defined(USE_NEON)
     return vextq_u8(prev, input, sizeof(pg_u8x16_t) - 3);
+#elif defined(USE_ALTIVEC)
+    return rev(vec_sld(rev(prev), rev(input), sizeof(pg_u8x16_t) - 3));
 #elif defined(USE_SSE)
     return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
 #endif
@@ -375,6 +428,8 @@ lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
 #if defined(USE_NEON)
     return vqtbl1q_u8(lookup, input);
+#elif defined(USE_ALTIVEC)
+    return vec_perm(lookup, vzero(), input);
 #elif defined(USE_SSE)
     return _mm_shuffle_epi8(lookup, input);
 #endif
@@ -488,6 +543,8 @@ to_bool(const pg_u8x16_t v)
 {
 #if defined(USE_NEON)
     return vmaxvq_u8(v) != 0;
+#elif defined(USE_ALTIVEC)
+    return !vec_all_eq(v, vzero());
 #elif defined(USE_SSE)
     /*
      * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
@@ -503,6 +560,8 @@ check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
 #if defined(USE_NEON)
     const pg_u8x16_t cmp = vceqq_u8(v, vzero());
+#elif defined(USE_ALTIVEC)
+    const pg_u8x16_t cmp = (pg_u8x16_t) vec_cmpeq(v, vzero());
 #elif defined(USE_SSE)
     const pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
 #endif
@@ -516,6 +575,8 @@ is_highbit_set(const pg_u8x16_t v)
 {
 #if defined(USE_NEON)
     return vmaxvq_u8(v) > 0x7F;
+#elif defined(USE_ALTIVEC)
+    return to_bool(vec_and(v, splat(0x80)));
 #elif defined(USE_SSE)
     return _mm_movemask_epi8(v) != 0;
 #endif
-- 
2.30.2