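v3 applies on top of the v9 json_lex_string patch in [1] and extends it: is_valid_ascii() is rewritten in terms of the Vector8 helpers in port/simd.h (adding vector8_or, vector8_eq and vector8_is_highbit_set), and the stride used by pg_utf8_verifystr() becomes two vector widths. The result is a simpler patch that is easier to extend to additional SIMD-capable platforms.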
[1] https://www.postgresql.org/message-id/CAFBsxsFV4v802idV0-Bo%3DV7wLMHRbOZ4er0hgposhyGCikmVGA%40mail.gmail.com

--
John Naylor
EDB: http://www.enterprisedb.com
diff --git a/src/common/wchar.c b/src/common/wchar.c index 1e6e198bf2..1ca7533f00 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -1918,11 +1918,12 @@ pg_utf8_verifystr(const unsigned char *s, int len) const int orig_len = len; uint32 state = BGN; -/* - * Sixteen seems to give the best balance of performance across different - * byte distributions. - */ -#define STRIDE_LENGTH 16 + /* + * With a stride of two vector widths, gcc will unroll the loop. Even if + * the compiler can unroll a longer loop, it's not worth it because we + * must fall back to the byte-wise algorithm if we find any non-ASCII. + */ +#define STRIDE_LENGTH (2 * sizeof(Vector8)) if (len >= STRIDE_LENGTH) { diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 011b0b3abd..aea045aa66 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -19,6 +19,8 @@ #ifndef PG_WCHAR_H #define PG_WCHAR_H +#include "port/simd.h" + /* * The pg_wchar type */ @@ -704,25 +706,28 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len); * Verify a chunk of bytes for valid ASCII. * * Returns false if the input contains any zero bytes or bytes with the - * high-bit set. Input len must be a multiple of 8. + * high-bit set. Input len must be a multiple of the chunk size (8 or 16). */ static inline bool is_valid_ascii(const unsigned char *s, int len) { const unsigned char *const s_end = s + len; - uint64 chunk, - highbit_cum = UINT64CONST(0), - zero_cum = UINT64CONST(0x8080808080808080); + Vector8 chunk; + Vector8 highbit_cum = vector8_broadcast(0); +#ifdef USE_NO_SIMD + Vector8 zero_cum = vector8_broadcast(0x80); +#endif Assert(len % sizeof(chunk) == 0); while (s < s_end) { - memcpy(&chunk, s, sizeof(chunk)); + vector8_load(&chunk, s); + + /* Capture any zero bytes in this chunk. */ +#if defined(USE_NO_SIMD) /* - * Capture any zero bytes in this chunk. - * * First, add 0x7f to each byte. This sets the high bit in each byte, * unless it was a zero. 
If any resulting high bits are zero, the * corresponding high bits in the zero accumulator will be cleared. @@ -734,20 +739,31 @@ is_valid_ascii(const unsigned char *s, int len) * because we check for those separately. */ zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f)); +#else + + /* + * Set all bits in each lane of the highbit accumulator where input + * bytes are zero. + */ + highbit_cum = vector8_or(highbit_cum, + vector8_eq(chunk, vector8_broadcast(0))); +#endif /* Capture all set bits in this chunk. */ - highbit_cum |= chunk; + highbit_cum = vector8_or(highbit_cum, chunk); s += sizeof(chunk); } /* Check if any high bits in the high bit accumulator got set. */ - if (highbit_cum & UINT64CONST(0x8080808080808080)) + if (vector8_is_highbit_set(highbit_cum)) return false; +#ifdef USE_NO_SIMD /* Check if any high bits in the zero accumulator got cleared. */ - if (zero_cum != UINT64CONST(0x8080808080808080)) + if (zero_cum != vector8_broadcast(0x80)) return false; +#endif return true; } diff --git a/src/include/port/simd.h b/src/include/port/simd.h index 56df989094..8f85153110 100644 --- a/src/include/port/simd.h +++ b/src/include/port/simd.h @@ -38,6 +38,7 @@ typedef __m128i Vector8; * If no SIMD instructions are available, we can in some cases emulate vector * operations using bitwise operations on unsigned integers. */ +#define USE_NO_SIMD typedef uint64 Vector8; #endif @@ -47,7 +48,11 @@ static inline Vector8 vector8_broadcast(const uint8 c); static inline bool vector8_has_zero(const Vector8 v); static inline bool vector8_has(const Vector8 v, const uint8 c); static inline bool vector8_has_le(const Vector8 v, const uint8 c); - +static inline bool vector8_is_highbit_set(const Vector8 v); +static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2); +#ifndef USE_NO_SIMD +static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2); +#endif /* * Functions for loading a chunk of memory into a vector. 
@@ -181,4 +186,38 @@ vector8_has_le(const Vector8 v, const uint8 c) return result; } +static inline bool +vector8_is_highbit_set(const Vector8 v) +{ +#ifdef USE_SSE2 + return _mm_movemask_epi8(v) != 0; +#else + return v & vector8_broadcast(0x80); +#endif +} + +/* comparisons between vectors */ + +#ifndef USE_NO_SIMD +static inline Vector8 +vector8_eq(const Vector8 v1, const Vector8 v2) +{ +#ifdef USE_SSE2 + return _mm_cmpeq_epi8(v1, v2); +#endif +} +#endif + +/* bitwise operations */ + +static inline Vector8 +vector8_or(const Vector8 v1, const Vector8 v2) +{ +#ifdef USE_SSE2 + return _mm_or_si128(v1, v2); +#else + return v1 | v2; +#endif +} + #endif /* SIMD_H */