On Mon, Nov 20, 2023 at 10:50:36AM -0800, Jubilee Young wrote:
> In that case, I took a look across the codebase and saw a utils/ascii.h
> that doesn't seem to have gotten much love, but I suppose one could argue
> that it's intended to be a backend-only header file?

That might work.  It's not #included in very many files, so adding
port/simd.h shouldn't be too bad.  And ascii.h is also pretty inexpensive,
so including it in wchar.c seems permissible, too.  I'm not certain this
doesn't cause problems with libpgcommon, but I don't see why it would,
either.

> So it should probably end up living somewhere near the UTF-8 support, and
> the easiest way to make it not go into something pgrx currently includes
> would be to make it a new header file, though there's a fair amount of API
> we don't touch.

Does pgrx use ascii.h at all?

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
diff --git a/src/common/wchar.c b/src/common/wchar.c
index fb9d9f5c85..fbac11deb4 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -13,6 +13,7 @@
 #include "c.h"
 
 #include "mb/pg_wchar.h"
+#include "utils/ascii.h"
 
 
 /*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 29cd5732f1..80676d9e02 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -22,8 +22,6 @@
 #ifndef PG_WCHAR_H
 #define PG_WCHAR_H
 
-#include "port/simd.h"
-
 /*
  * The pg_wchar type
  */
@@ -722,71 +720,4 @@ extern int	mic2latin_with_table(const unsigned char *mic, unsigned char *p,
 extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
 #endif
 
-
-/*
- * Verify a chunk of bytes for valid ASCII.
- *
- * Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
- */
-static inline bool
-is_valid_ascii(const unsigned char *s, int len)
-{
-	const unsigned char *const s_end = s + len;
-	Vector8		chunk;
-	Vector8		highbit_cum = vector8_broadcast(0);
-#ifdef USE_NO_SIMD
-	Vector8		zero_cum = vector8_broadcast(0x80);
-#endif
-
-	Assert(len % sizeof(chunk) == 0);
-
-	while (s < s_end)
-	{
-		vector8_load(&chunk, s);
-
-		/* Capture any zero bytes in this chunk. */
-#ifdef USE_NO_SIMD
-
-		/*
-		 * First, add 0x7f to each byte. This sets the high bit in each byte,
-		 * unless it was a zero. If any resulting high bits are zero, the
-		 * corresponding high bits in the zero accumulator will be cleared.
-		 *
-		 * If none of the bytes in the chunk had the high bit set, the max
-		 * value each byte can have after the addition is 0x7f + 0x7f = 0xfe,
-		 * and we don't need to worry about carrying over to the next byte. If
-		 * any input bytes did have the high bit set, it doesn't matter
-		 * because we check for those separately.
-		 */
-		zero_cum &= (chunk + vector8_broadcast(0x7F));
-#else
-
-		/*
-		 * Set all bits in each lane of the highbit accumulator where input
-		 * bytes are zero.
-		 */
-		highbit_cum = vector8_or(highbit_cum,
-								 vector8_eq(chunk, vector8_broadcast(0)));
-#endif
-
-		/* Capture all set bits in this chunk. */
-		highbit_cum = vector8_or(highbit_cum, chunk);
-
-		s += sizeof(chunk);
-	}
-
-	/* Check if any high bits in the high bit accumulator got set. */
-	if (vector8_is_highbit_set(highbit_cum))
-		return false;
-
-#ifdef USE_NO_SIMD
-	/* Check if any high bits in the zero accumulator got cleared. */
-	if (zero_cum != vector8_broadcast(0x80))
-		return false;
-#endif
-
-	return true;
-}
-
 #endif							/* PG_WCHAR_H */
diff --git a/src/include/utils/ascii.h b/src/include/utils/ascii.h
index 630acd9bfd..7df024dad3 100644
--- a/src/include/utils/ascii.h
+++ b/src/include/utils/ascii.h
@@ -11,6 +11,74 @@
 #ifndef _ASCII_H_
 #define _ASCII_H_
 
+#include "port/simd.h"
+
 extern void ascii_safe_strlcpy(char *dest, const char *src, size_t destsiz);
 
+/*
+ * Verify a chunk of bytes for valid ASCII.
+ *
+ * Returns false if the input contains any zero bytes or bytes with the
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
+ */
+static inline bool
+is_valid_ascii(const unsigned char *s, int len)
+{
+	const unsigned char *const s_end = s + len;
+	Vector8		chunk;
+	Vector8		highbit_cum = vector8_broadcast(0);
+#ifdef USE_NO_SIMD
+	Vector8		zero_cum = vector8_broadcast(0x80);
+#endif
+
+	Assert(len % sizeof(chunk) == 0);
+
+	while (s < s_end)
+	{
+		vector8_load(&chunk, s);
+
+		/* Capture any zero bytes in this chunk. */
+#ifdef USE_NO_SIMD
+
+		/*
+		 * First, add 0x7f to each byte. This sets the high bit in each byte,
+		 * unless it was a zero. If any resulting high bits are zero, the
+		 * corresponding high bits in the zero accumulator will be cleared.
+		 *
+		 * If none of the bytes in the chunk had the high bit set, the max
+		 * value each byte can have after the addition is 0x7f + 0x7f = 0xfe,
+		 * and we don't need to worry about carrying over to the next byte. If
+		 * any input bytes did have the high bit set, it doesn't matter
+		 * because we check for those separately.
+		 */
+		zero_cum &= (chunk + vector8_broadcast(0x7F));
+#else
+
+		/*
+		 * Set all bits in each lane of the highbit accumulator where input
+		 * bytes are zero.
+		 */
+		highbit_cum = vector8_or(highbit_cum,
+								 vector8_eq(chunk, vector8_broadcast(0)));
+#endif
+
+		/* Capture all set bits in this chunk. */
+		highbit_cum = vector8_or(highbit_cum, chunk);
+
+		s += sizeof(chunk);
+	}
+
+	/* Check if any high bits in the high bit accumulator got set. */
+	if (vector8_is_highbit_set(highbit_cum))
+		return false;
+
+#ifdef USE_NO_SIMD
+	/* Check if any high bits in the zero accumulator got cleared. */
+	if (zero_cum != vector8_broadcast(0x80))
+		return false;
+#endif
+
+	return true;
+}
+
 #endif							/* _ASCII_H_ */

Reply via email to