v3 applies on top of the v9 json_lex_string patch in [1] and builds on
it slightly, resulting in a simpler patch that is easier to adapt to
additional SIMD-capable platforms.

[1] https://www.postgresql.org/message-id/CAFBsxsFV4v802idV0-Bo%3DV7wLMHRbOZ4er0hgposhyGCikmVGA%40mail.gmail.com

-- 
John Naylor
EDB: http://www.enterprisedb.com
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 1e6e198bf2..1ca7533f00 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1918,11 +1918,12 @@ pg_utf8_verifystr(const unsigned char *s, int len)
 	const int	orig_len = len;
 	uint32		state = BGN;
 
-/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
- */
-#define STRIDE_LENGTH 16
+	/*
+	 * With a stride of two vector widths, gcc will unroll the loop. Even if
+	 * the compiler can unroll a longer loop, it's not worth it because we
+	 * must fall back to the byte-wise algorithm if we find any non-ASCII.
+	 */
+#define STRIDE_LENGTH (2 * sizeof(Vector8))
 
 	if (len >= STRIDE_LENGTH)
 	{
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 011b0b3abd..aea045aa66 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -19,6 +19,8 @@
 #ifndef PG_WCHAR_H
 #define PG_WCHAR_H
 
+#include "port/simd.h"
+
 /*
  * The pg_wchar type
  */
@@ -704,25 +706,28 @@ extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
  * Verify a chunk of bytes for valid ASCII.
  *
  * Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
  */
 static inline bool
 is_valid_ascii(const unsigned char *s, int len)
 {
 	const unsigned char *const s_end = s + len;
-	uint64		chunk,
-				highbit_cum = UINT64CONST(0),
-				zero_cum = UINT64CONST(0x8080808080808080);
+	Vector8		chunk;
+	Vector8		highbit_cum = vector8_broadcast(0);
+#ifdef USE_NO_SIMD
+	Vector8		zero_cum = vector8_broadcast(0x80);
+#endif
 
 	Assert(len % sizeof(chunk) == 0);
 
 	while (s < s_end)
 	{
-		memcpy(&chunk, s, sizeof(chunk));
+		vector8_load(&chunk, s);
+
+		/* Capture any zero bytes in this chunk. */
+#if defined(USE_NO_SIMD)
 
 		/*
-		 * Capture any zero bytes in this chunk.
-		 *
 		 * First, add 0x7f to each byte. This sets the high bit in each byte,
 		 * unless it was a zero. If any resulting high bits are zero, the
 		 * corresponding high bits in the zero accumulator will be cleared.
@@ -734,20 +739,31 @@ is_valid_ascii(const unsigned char *s, int len)
 		 * because we check for those separately.
 		 */
 		zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
+#else
+
+		/*
+		 * Set all bits in each lane of the highbit accumulator where input
+		 * bytes are zero.
+		 */
+		highbit_cum = vector8_or(highbit_cum,
+								 vector8_eq(chunk, vector8_broadcast(0)));
+#endif
 
 		/* Capture all set bits in this chunk. */
-		highbit_cum |= chunk;
+		highbit_cum = vector8_or(highbit_cum, chunk);
 
 		s += sizeof(chunk);
 	}
 
 	/* Check if any high bits in the high bit accumulator got set. */
-	if (highbit_cum & UINT64CONST(0x8080808080808080))
+	if (vector8_is_highbit_set(highbit_cum))
 		return false;
 
+#ifdef USE_NO_SIMD
 	/* Check if any high bits in the zero accumulator got cleared. */
-	if (zero_cum != UINT64CONST(0x8080808080808080))
+	if (zero_cum != vector8_broadcast(0x80))
 		return false;
+#endif
 
 	return true;
 }
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 56df989094..8f85153110 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -38,6 +38,7 @@ typedef __m128i Vector8;
  * If no SIMD instructions are available, we can in some cases emulate vector
  * operations using bitwise operations on unsigned integers.
  */
+#define USE_NO_SIMD
 typedef uint64 Vector8;
 #endif
 
@@ -47,7 +48,11 @@ static inline Vector8 vector8_broadcast(const uint8 c);
 static inline bool vector8_has_zero(const Vector8 v);
 static inline bool vector8_has(const Vector8 v, const uint8 c);
 static inline bool vector8_has_le(const Vector8 v, const uint8 c);
-
+static inline bool vector8_is_highbit_set(const Vector8 v);
+static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
+#ifndef USE_NO_SIMD
+static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+#endif
 
 /*
  * Functions for loading a chunk of memory into a vector.
@@ -181,4 +186,49 @@ vector8_has_le(const Vector8 v, const uint8 c)
 	return result;
 }
 
+/*
+ * Return true if the high bit of any element is set.
+ */
+static inline bool
+vector8_is_highbit_set(const Vector8 v)
+{
+#ifdef USE_SSE2
+	return _mm_movemask_epi8(v) != 0;
+#else
+	/* Compare explicitly; don't rely on implicit uint64-to-bool conversion. */
+	return (v & vector8_broadcast(0x80)) != 0;
+#endif
+}
+
+/* comparisons between vectors */
+
+/*
+ * Return a vector with all bits set in each lane where the corresponding
+ * lanes of the inputs are equal. Not available without SIMD support.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_eq(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+	return _mm_cmpeq_epi8(v1, v2);
+#endif
+}
+#endif
+
+/* bitwise operations */
+
+/*
+ * Return the bitwise OR of the inputs.
+ */
+static inline Vector8
+vector8_or(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+	return _mm_or_si128(v1, v2);
+#else
+	return v1 | v2;
+#endif
+}
+
 #endif							/* SIMD_H */

Reply via email to