I wrote:

> We can also shave a few percent by having pg_utf8_verifystr use SSE2
> for the ascii path. I can look into this.
Here's a patch for that. If the input is mostly ascii, I'd expect that
part of the flame graph to shrink by 40-50% and give a small boost
overall.
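To show the idea in isolation before the diff: OR every 16-byte chunk
into an "error" accumulator to collect high bits from non-ascii bytes,
OR in a compare-against-zero to catch embedded zero bytes, then test
the accumulated high bits once at the end with a movemask. Here's a
minimal standalone sketch of that technique -- the file name, the
function name check_ascii_sse2, and the test strings are mine for
illustration and not part of the patch (which also includes nmmintrin.h
rather than emmintrin.h, since it piggy-backs on the SSE 4.2 configure
test):

/*
 * ascii_sse2_demo.c - standalone sketch (hypothetical names, not in the patch)
 *
 * Build on x86-64: gcc -O2 -o ascii_sse2_demo ascii_sse2_demo.c
 */
#include <emmintrin.h>          /* SSE2 is enough for the demo */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* len must be a multiple of 16 */
static bool
check_ascii_sse2(const unsigned char *s, int len)
{
    __m128i     error_cum = _mm_setzero_si128();
    __m128i     chunk,
                zeros;

    while (len > 0)
    {
        chunk = _mm_loadu_si128((const __m128i *) s);

        /* Any byte >= 0x80 contributes its high bit to the accumulator. */
        error_cum = _mm_or_si128(error_cum, chunk);

        /* A zero byte compares equal to zero, setting all bits in its lane. */
        zeros = _mm_cmpeq_epi8(chunk, _mm_setzero_si128());
        error_cum = _mm_or_si128(error_cum, zeros);

        s += sizeof(chunk);
        len -= sizeof(chunk);
    }

    /* Gather the high bit of each lane; all-ascii input leaves them clear. */
    return _mm_movemask_epi8(error_cum) == 0;
}

int
main(void)
{
    const unsigned char ascii[] = "0123456789abcdef0123456789abcdef";
    unsigned char multibyte[32];

    memcpy(multibyte, ascii, 32);
    multibyte[17] = 0xC3;       /* first byte of a two-byte UTF-8 sequence */

    printf("ascii chunk:     %d\n", (int) check_ascii_sse2(ascii, 32));
    printf("multibyte chunk: %d\n", (int) check_ascii_sse2(multibyte, 32));
    return 0;
}

The loop body is branch-free, so validity is decided by a single
_mm_movemask_epi8() after the loop and the compiler is free to unroll.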
--
John Naylor
EDB: http://www.enterprisedb.com

 src/common/wchar.c                       | 18 ++++--------
 src/include/mb/pg_wchar.h                | 50 ++++++++++++++++++++++++++++++--
 src/test/regress/expected/conversion.out |  3 +-
 src/test/regress/sql/conversion.sql      |  3 +-
 4 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..a305e0e66b 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,26 +1918,20 @@ pg_utf8_verifystr(const unsigned char *s, int len)
     const int   orig_len = len;
     uint32      state = BGN;
 
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
-
-    if (len >= STRIDE_LENGTH)
+    if (len >= ASCII_CHECK_LEN)
     {
-        while (len >= STRIDE_LENGTH)
+        while (len >= ASCII_CHECK_LEN)
         {
             /*
              * If the chunk is all ASCII, we can skip the full UTF-8 check,
              * but we must first check for a non-END state, which means the
              * previous chunk ended in the middle of a multibyte sequence.
              */
-            if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
-                utf8_advance(s, &state, STRIDE_LENGTH);
+            if (state != END || !is_valid_ascii(s, ASCII_CHECK_LEN))
+                utf8_advance(s, &state, ASCII_CHECK_LEN);
 
-            s += STRIDE_LENGTH;
-            len -= STRIDE_LENGTH;
+            s += ASCII_CHECK_LEN;
+            len -= ASCII_CHECK_LEN;
         }
 
         /* The error state persists, so we only need to check for it here. */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 31f5b393da..ca238c212b 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -700,19 +700,64 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
 #endif
 
+/*
+ * Note: We piggy-back on the check for SSE 4.2 intrinsics but only need SSE2 at runtime.
+ * That's supported by all x86-64 hardware, so we don't need an indirect function call.
+ * WIP: put this somewhere useful
+ */
+#if (defined(__x86_64__) || defined(_M_AMD64)) && (defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK))
+#include <nmmintrin.h>
+#define USE_SSE2
+#endif
+
 /*
  * Verify a chunk of bytes for valid ASCII.
  *
  * Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
  */
 static inline bool
 is_valid_ascii(const unsigned char *s, int len)
 {
+#ifdef USE_SSE2
+    __m128i     chunk,
+                error_cum = _mm_setzero_si128(),
+                zeros;
+
+/*
+ * With two chunks, gcc can unroll the loop, so provide a convenience macro for
+ * callers. Even if the compiler can unroll a longer loop, it's not worth it
+ * because callers might have to use a byte-wise algorithm if we return false.
+ */
+#define ASCII_CHECK_LEN (2 * sizeof(__m128i))
+    Assert(len % sizeof(chunk) == 0);
+
+    while (len > 0)
+    {
+        chunk = _mm_loadu_si128((const __m128i *) s);
+
+        /* Capture all set bits in this chunk. */
+        error_cum = _mm_or_si128(error_cum, chunk);
+
+        /*
+         * Set all bits in each lane of the error accumulator where input
+         * bytes are zero.
+         */
+        zeros = _mm_cmpeq_epi8(chunk, _mm_setzero_si128());
+        error_cum = _mm_or_si128(error_cum, zeros);
+
+        s += sizeof(chunk);
+        len -= sizeof(chunk);
+    }
+
+    /* Check if any high bits in the error accumulator got set. */
+    return _mm_movemask_epi8(error_cum) == 0;
+
+#else
     uint64      chunk,
                 highbit_cum = UINT64CONST(0),
                 zero_cum = UINT64CONST(0x8080808080808080);
 
+#define ASCII_CHECK_LEN (2 * sizeof(uint64))
     Assert(len % sizeof(chunk) == 0);
 
     while (len > 0)
@@ -734,7 +779,7 @@ is_valid_ascii(const unsigned char *s, int len)
          */
         zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
 
-        /* Capture any set bits in this chunk. */
+        /* Capture all set bits in this chunk. */
         highbit_cum |= chunk;
 
         s += sizeof(chunk);
@@ -750,6 +795,7 @@ is_valid_ascii(const unsigned char *s, int len)
         return false;
 
     return true;
+#endif                          /* USE_SSE2 */
 }
 
 #endif                          /* PG_WCHAR_H */
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 442e7aff2b..434dc4d93c 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -140,7 +140,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
 -- will contain all 4 bytes if they are present, so various
 -- expressions below add 3 ASCII bytes to the end to ensure
 -- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to be equal to or a multiple of the largest
+-- possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
 -- Test multibyte verification in fast path
 with test_bytes as (
   select
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 9a65fca91f..27ef069eaf 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -121,7 +121,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
 -- will contain all 4 bytes if they are present, so various
 -- expressions below add 3 ASCII bytes to the end to ensure
 -- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to be equal to or a multiple of the largest
+-- possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
 -- Test multibyte verification in fast path
 with test_bytes as (
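One thing that's easy to miss when reviewing the #else branch above: the
scalar fallback detects zero bytes by adding 0x7f to every byte, which
sets bit 7 for any nonzero ascii byte while a zero byte leaves it clear.
A carry from a neighboring byte >= 0x80 can disturb a lane, but any such
byte already fails the high-bit test, so the combined verdict is still
correct. Here's a standalone sketch of that fallback (the function name
and the UINT64_C spelling are mine; the patch uses UINT64CONST), which
can be swapped in for check_ascii_sse2 in the demo earlier in this mail,
passing a len that's a multiple of 8:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* len must be a multiple of 8 */
static bool
check_ascii_scalar(const unsigned char *s, int len)
{
    uint64_t    chunk,
                highbit_cum = 0,
                zero_cum = UINT64_C(0x8080808080808080);

    while (len > 0)
    {
        memcpy(&chunk, s, sizeof(chunk));   /* alignment-safe load */

        /*
         * Adding 0x7f sets bit 7 of every nonzero ascii byte; a zero byte
         * leaves its bit 7 clear, knocking that position out of zero_cum.
         */
        zero_cum &= (chunk + UINT64_C(0x7f7f7f7f7f7f7f7f));

        /* Collect high bits from any non-ascii bytes. */
        highbit_cum |= chunk;

        s += sizeof(chunk);
        len -= sizeof(chunk);
    }

    return (highbit_cum & UINT64_C(0x8080808080808080)) == 0 &&
        zero_cum == UINT64_C(0x8080808080808080);
}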