On Mon, Nov 20, 2023 at 10:50:36AM -0800, Jubilee Young wrote:
> In that case, I took a look across the codebase and saw a utils/ascii.h
> that doesn't seem to have gotten much love, but I suppose one could argue
> that it's intended to be a backend-only header file?

That might work. It's not #included in very many files, so adding
port/simd.h shouldn't be too bad. And ascii.h is also pretty inexpensive,
so including it in wchar.c seems permissible, too. I'm not certain this
doesn't cause problems with libpgcommon, but I don't see why it would,
either.

> So it should probably end up living somewhere near the UTF-8 support, and
> the easiest way to make it not go into something pgrx currently includes
> would be to make it a new header file, though there's a fair amount of API
> we don't touch.

Does pgrx use ascii.h at all?

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

diff --git a/src/common/wchar.c b/src/common/wchar.c index fb9d9f5c85..fbac11deb4 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -13,6 +13,7 @@ #include "c.h" #include "mb/pg_wchar.h" +#include "utils/ascii.h" /* diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 29cd5732f1..80676d9e02 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -22,8 +22,6 @@ #ifndef PG_WCHAR_H #define PG_WCHAR_H -#include "port/simd.h" - /* * The pg_wchar type */ @@ -722,71 +720,4 @@ extern int mic2latin_with_table(const unsigned char *mic, unsigned char *p, extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len); #endif - -/* - * Verify a chunk of bytes for valid ASCII. - * - * Returns false if the input contains any zero bytes or bytes with the - * high-bit set. Input len must be a multiple of the chunk size (8 or 16). - */ -static inline bool -is_valid_ascii(const unsigned char *s, int len) -{ - const unsigned char *const s_end = s + len; - Vector8 chunk; - Vector8 highbit_cum = vector8_broadcast(0); -#ifdef USE_NO_SIMD - Vector8 zero_cum = vector8_broadcast(0x80); -#endif - - Assert(len % sizeof(chunk) == 0); - - while (s < s_end) - { - vector8_load(&chunk, s); - - /* Capture any zero bytes in this chunk. */ -#ifdef USE_NO_SIMD - - /* - * First, add 0x7f to each byte. This sets the high bit in each byte, - * unless it was a zero. If any resulting high bits are zero, the - * corresponding high bits in the zero accumulator will be cleared. - * - * If none of the bytes in the chunk had the high bit set, the max - * value each byte can have after the addition is 0x7f + 0x7f = 0xfe, - * and we don't need to worry about carrying over to the next byte. If - * any input bytes did have the high bit set, it doesn't matter - * because we check for those separately. 
- */ - zero_cum &= (chunk + vector8_broadcast(0x7F)); -#else - - /* - * Set all bits in each lane of the highbit accumulator where input - * bytes are zero. - */ - highbit_cum = vector8_or(highbit_cum, - vector8_eq(chunk, vector8_broadcast(0))); -#endif - - /* Capture all set bits in this chunk. */ - highbit_cum = vector8_or(highbit_cum, chunk); - - s += sizeof(chunk); - } - - /* Check if any high bits in the high bit accumulator got set. */ - if (vector8_is_highbit_set(highbit_cum)) - return false; - -#ifdef USE_NO_SIMD - /* Check if any high bits in the zero accumulator got cleared. */ - if (zero_cum != vector8_broadcast(0x80)) - return false; -#endif - - return true; -} - #endif /* PG_WCHAR_H */ diff --git a/src/include/utils/ascii.h b/src/include/utils/ascii.h index 630acd9bfd..7df024dad3 100644 --- a/src/include/utils/ascii.h +++ b/src/include/utils/ascii.h @@ -11,6 +11,74 @@ #ifndef _ASCII_H_ #define _ASCII_H_ +#include "port/simd.h" + extern void ascii_safe_strlcpy(char *dest, const char *src, size_t destsiz); +/* + * Verify a chunk of bytes for valid ASCII. + * + * Returns false if the input contains any zero bytes or bytes with the + * high-bit set. Input len must be a multiple of the chunk size (8 or 16). + */ +static inline bool +is_valid_ascii(const unsigned char *s, int len) +{ + const unsigned char *const s_end = s + len; + Vector8 chunk; + Vector8 highbit_cum = vector8_broadcast(0); +#ifdef USE_NO_SIMD + Vector8 zero_cum = vector8_broadcast(0x80); +#endif + + Assert(len % sizeof(chunk) == 0); + + while (s < s_end) + { + vector8_load(&chunk, s); + + /* Capture any zero bytes in this chunk. */ +#ifdef USE_NO_SIMD + + /* + * First, add 0x7f to each byte. This sets the high bit in each byte, + * unless it was a zero. If any resulting high bits are zero, the + * corresponding high bits in the zero accumulator will be cleared. 
+ * + * If none of the bytes in the chunk had the high bit set, the max + * value each byte can have after the addition is 0x7f + 0x7f = 0xfe, + * and we don't need to worry about carrying over to the next byte. If + * any input bytes did have the high bit set, it doesn't matter + * because we check for those separately. + */ + zero_cum &= (chunk + vector8_broadcast(0x7F)); +#else + + /* + * Set all bits in each lane of the highbit accumulator where input + * bytes are zero. + */ + highbit_cum = vector8_or(highbit_cum, + vector8_eq(chunk, vector8_broadcast(0))); +#endif + + /* Capture all set bits in this chunk. */ + highbit_cum = vector8_or(highbit_cum, chunk); + + s += sizeof(chunk); + } + + /* Check if any high bits in the high bit accumulator got set. */ + if (vector8_is_highbit_set(highbit_cum)) + return false; + +#ifdef USE_NO_SIMD + /* Check if any high bits in the zero accumulator got cleared. */ + if (zero_cum != vector8_broadcast(0x80)) + return false; +#endif + + return true; +} + #endif /* _ASCII_H_ */