I wrote:
> We can also shave a
> few percent by having pg_utf8_verifystr use SSE2 for the ascii path. I
> can look into this.

Here's a patch for that. If the input is mostly ascii, I'd expect that
part of the flame graph to shrink by 40-50% and give a small boost
overall.

-- 
John Naylor
EDB: http://www.enterprisedb.com
 src/common/wchar.c                       | 18 ++++--------
 src/include/mb/pg_wchar.h                | 50 ++++++++++++++++++++++++++++++--
 src/test/regress/expected/conversion.out |  3 +-
 src/test/regress/sql/conversion.sql      |  3 +-
 4 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..a305e0e66b 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,26 +1918,20 @@ pg_utf8_verifystr(const unsigned char *s, int len)
 	const int	orig_len = len;
 	uint32		state = BGN;
 
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
-
-	if (len >= STRIDE_LENGTH)
+	if (len >= ASCII_CHECK_LEN)
 	{
-		while (len >= STRIDE_LENGTH)
+		while (len >= ASCII_CHECK_LEN)
 		{
 			/*
 			 * If the chunk is all ASCII, we can skip the full UTF-8 check,
 			 * but we must first check for a non-END state, which means the
 			 * previous chunk ended in the middle of a multibyte sequence.
 			 */
-			if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
-				utf8_advance(s, &state, STRIDE_LENGTH);
+			if (state != END || !is_valid_ascii(s, ASCII_CHECK_LEN))
+				utf8_advance(s, &state, ASCII_CHECK_LEN);
 
-			s += STRIDE_LENGTH;
-			len -= STRIDE_LENGTH;
+			s += ASCII_CHECK_LEN;
+			len -= ASCII_CHECK_LEN;
 		}
 
 		/* The error state persists, so we only need to check for it here. */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 31f5b393da..ca238c212b 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -700,19 +700,64 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
 #endif
 
 
+/*
+ * Note: We piggy-back on the check for SSE 4.2 intrinsics but only need SSE2 at runtime.
+ * That's supported by all x86-64 hardware, so we don't need an indirect function call.
+ * WIP: put this somewhere useful
+ */
+#if (defined (__x86_64__) || defined(_M_AMD64)) && (defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK))
+#include <nmmintrin.h>
+#define USE_SSE2
+#endif
+
 /*
  * Verify a chunk of bytes for valid ASCII.
  *
  * Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
  */
 static inline bool
 is_valid_ascii(const unsigned char *s, int len)
 {
+#ifdef USE_SSE2
+	__m128i     chunk,
+				error_cum = _mm_setzero_si128(),
+				zeros;
+
+/*
+ * With two chunks, gcc can unroll the loop, so provide a convenience macro for
+ * callers. Even if the compiler can unroll a longer loop, it's not worth it
+ * because callers might have to use a byte-wise algorithm if we return false.
+ */
+#define ASCII_CHECK_LEN (2 * sizeof(__m128i))
+	Assert(len % sizeof(chunk) == 0);
+
+	while (len > 0)
+	{
+		chunk = _mm_loadu_si128((const __m128i *) s);
+
+		/* Capture all set bits in this chunk. */
+		error_cum = _mm_or_si128(error_cum, chunk);
+
+		/*
+		 * Set all bits in each lane of the error accumulator where input bytes are zero.
+		 */
+		zeros = _mm_cmpeq_epi8(chunk, _mm_setzero_si128());
+		error_cum = _mm_or_si128(error_cum, zeros);
+
+		s += sizeof(chunk);
+		len -= sizeof(chunk);
+	}
+
+	/* Check if any high bits in the error accumulator got set. */
+	return _mm_movemask_epi8(error_cum) == 0;
+
+#else
 	uint64		chunk,
 				highbit_cum = UINT64CONST(0),
 				zero_cum = UINT64CONST(0x8080808080808080);
 
+#define ASCII_CHECK_LEN (2 * sizeof(uint64))
 	Assert(len % sizeof(chunk) == 0);
 
 	while (len > 0)
@@ -734,7 +779,7 @@ is_valid_ascii(const unsigned char *s, int len)
 		 */
 		zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
 
-		/* Capture any set bits in this chunk. */
+		/* Capture all set bits in this chunk. */
 		highbit_cum |= chunk;
 
 		s += sizeof(chunk);
@@ -750,6 +795,7 @@ is_valid_ascii(const unsigned char *s, int len)
 		return false;
 
 	return true;
+#endif /* USE_SSE2 */
 }
 
 #endif							/* PG_WCHAR_H */
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 442e7aff2b..434dc4d93c 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -140,7 +140,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verificatio
 -- will contain all 4 bytes if they are present, so various
 -- expressions below add 3 ASCII bytes to the end to ensure
 -- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to be equal to or a multiple of the
+-- largest possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
 -- Test multibyte verification in fast path
 with test_bytes as (
   select
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 9a65fca91f..27ef069eaf 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -121,7 +121,8 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verificatio
 -- will contain all 4 bytes if they are present, so various
 -- expressions below add 3 ASCII bytes to the end to ensure
 -- consistent error messages.
--- The number 64 below needs to be at least the value of STRIDE_LENGTH in wchar.c.
+-- The number 64 below needs to be equal to or a multiple of the
+-- largest possible value of ASCII_CHECK_LEN in mb/pg_wchar.h.
 
 -- Test multibyte verification in fast path
 with test_bytes as (

Reply via email to