v9 is just a rebase.
--
John Naylor
EDB: http://www.enterprisedb.com
From e876049ad3b153e8725ab23f65ae8f021a970470 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Thu, 1 Apr 2021 08:24:05 -0400
Subject: [PATCH v9] Replace pg_utf8_verifystr() with two faster
implementations:
On x86-64, we use SSE 4.1 with a lookup algorithm based on "Validating
UTF-8 In Less Than One Instruction Per Byte" by John Keiser and Daniel
Lemire. Since configure already tests for SSE 4.2 for CRC, we piggy-back
on top of that. The lookup tables are taken from the simdjson library
(Apache 2.0 licensed), but the code is written from scratch using
simdjson as a reference.
On other platforms, we still get a performance boost by using a bespoke
fallback function, rather than one that relies on pg_utf8_verifychar()
and pg_utf8_isvalid(). This one is loosely based on the fallback that
is part of the simdjson library.
---
config/c-compiler.m4 | 28 +-
configure | 114 +++--
configure.ac | 61 ++-
src/Makefile.global.in | 3 +
src/common/wchar.c | 27 +-
src/include/pg_config.h.in | 9 +
src/include/port/pg_utf8.h | 86 ++++
src/port/Makefile | 6 +
src/port/pg_utf8_fallback.c | 129 ++++++
src/port/pg_utf8_sse42.c | 537 +++++++++++++++++++++++
src/port/pg_utf8_sse42_choose.c | 68 +++
src/test/regress/expected/conversion.out | 52 +++
src/test/regress/sql/conversion.sql | 28 ++
src/tools/msvc/Mkvcbuild.pm | 4 +
src/tools/msvc/Solution.pm | 3 +
15 files changed, 1088 insertions(+), 67 deletions(-)
create mode 100644 src/include/port/pg_utf8.h
create mode 100644 src/port/pg_utf8_fallback.c
create mode 100644 src/port/pg_utf8_sse42.c
create mode 100644 src/port/pg_utf8_sse42_choose.c
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 780e906ecc..b1604eac58 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -591,36 +591,46 @@ if test x"$pgac_cv_gcc_atomic_int64_cas" = x"yes"; then
AC_DEFINE(HAVE_GCC__ATOMIC_INT64_CAS, 1, [Define to 1 if you have __atomic_compare_exchange_n(int64 *, int64 *, int64).])
fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
-# PGAC_SSE42_CRC32_INTRINSICS
+# PGAC_SSE42_INTRINSICS
# ---------------------------
# Check if the compiler supports the x86 CRC instructions added in SSE 4.2,
# using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. (We don't
# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
# the other ones are, on x86-64 platforms)
#
+# While at it, check for support of x86 instructions added in SSSE3 and SSE4.1,
+# in particular _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128.
+# We should be able to assume these are understood by the compiler if CRC
+# intrinsics are, but it's better to document our reliance on them here.
+#
+# We don't test for SSE2 intrinsics, as they are assumed to be present on
+# x86-64 platforms, which we can easily check at compile time.
+#
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
-# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42.
-AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
+# intrinsics are supported, sets pgac_sse42_intrinsics, and CFLAGS_SSE42.
+AC_DEFUN([PGAC_SSE42_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=$1], [Ac_cachevar],
[pgac_save_CFLAGS=$CFLAGS
CFLAGS="$pgac_save_CFLAGS $1"
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
[unsigned int crc = 0;
crc = _mm_crc32_u8(crc, 0);
crc = _mm_crc32_u32(crc, 0);
+ __m128i vec = _mm_set1_epi8(crc);
+ vec = _mm_shuffle_epi8(vec,
+ _mm_alignr_epi8(vec, vec, 1));
/* return computed value, to prevent the above being optimized away */
- return crc == 0;])],
+ return _mm_testz_si128(vec, vec);])],
[Ac_cachevar=yes],
[Ac_cachevar=no])
CFLAGS="$pgac_save_CFLAGS"])
if test x"$Ac_cachevar" = x"yes"; then
CFLAGS_SSE42="$1"
- pgac_sse42_crc32_intrinsics=yes
+ pgac_sse42_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
-])# PGAC_SSE42_CRC32_INTRINSICS
-
+])# PGAC_SSE42_INTRINSICS
# PGAC_ARMV8_CRC32C_INTRINSICS
# ----------------------------
diff --git a/configure b/configure
index 06ad9aeb71..4d70f10fab 100755
--- a/configure
+++ b/configure
@@ -645,6 +645,7 @@ XGETTEXT
MSGMERGE
MSGFMT_FLAGS
MSGFMT
+PG_UTF8_OBJS
PG_CRC32C_OBJS
CFLAGS_ARMV8_CRC32C
CFLAGS_SSE42
@@ -17963,14 +17964,14 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
fi
-# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
+# Check for Intel SSE 4.2 intrinsics.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# First check if these intrinsics can be used
# with the default compiler flags. If not, check if adding the -msse4.2
# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=" >&5
+$as_echo_n "checking for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=... " >&6; }
+if ${pgac_cv_sse42_intrinsics_+:} false; then :
$as_echo_n "(cached) " >&6
else
pgac_save_CFLAGS=$CFLAGS
@@ -17984,32 +17985,35 @@ main ()
unsigned int crc = 0;
crc = _mm_crc32_u8(crc, 0);
crc = _mm_crc32_u32(crc, 0);
+ __m128i vec = _mm_set1_epi8(crc);
+ vec = _mm_shuffle_epi8(vec,
+ _mm_alignr_epi8(vec, vec, 1));
/* return computed value, to prevent the above being optimized away */
- return crc == 0;
+ return _mm_testz_si128(vec, vec);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics_=yes
+ pgac_cv_sse42_intrinsics_=yes
else
- pgac_cv_sse42_crc32_intrinsics_=no
+ pgac_cv_sse42_intrinsics_=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_intrinsics_" >&5
+$as_echo "$pgac_cv_sse42_intrinsics_" >&6; }
+if test x"$pgac_cv_sse42_intrinsics_" = x"yes"; then
CFLAGS_SSE42=""
- pgac_sse42_crc32_intrinsics=yes
+ pgac_sse42_intrinsics=yes
fi
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
+if test x"$pgac_sse42_intrinsics" != x"yes"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=-msse4.2" >&5
+$as_echo_n "checking for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=-msse4.2... " >&6; }
+if ${pgac_cv_sse42_intrinsics__msse4_2+:} false; then :
$as_echo_n "(cached) " >&6
else
pgac_save_CFLAGS=$CFLAGS
@@ -18023,26 +18027,29 @@ main ()
unsigned int crc = 0;
crc = _mm_crc32_u8(crc, 0);
crc = _mm_crc32_u32(crc, 0);
+ __m128i vec = _mm_set1_epi8(crc);
+ vec = _mm_shuffle_epi8(vec,
+ _mm_alignr_epi8(vec, vec, 1));
/* return computed value, to prevent the above being optimized away */
- return crc == 0;
+ return _mm_testz_si128(vec, vec);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+ pgac_cv_sse42_intrinsics__msse4_2=yes
else
- pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+ pgac_cv_sse42_intrinsics__msse4_2=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_intrinsics__msse4_2" >&5
+$as_echo "$pgac_cv_sse42_intrinsics__msse4_2" >&6; }
+if test x"$pgac_cv_sse42_intrinsics__msse4_2" = x"yes"; then
CFLAGS_SSE42="-msse4.2"
- pgac_sse42_crc32_intrinsics=yes
+ pgac_sse42_intrinsics=yes
fi
fi
@@ -18177,12 +18184,12 @@ fi
# in the template or configure command line.
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
# Use Intel SSE 4.2 if available.
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+ if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
USE_SSE42_CRC32C=1
else
# Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
# the runtime check.
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+ if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
else
# Use ARM CRC Extension if available.
@@ -18196,7 +18203,7 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
# fall back to slicing-by-8 algorithm, which doesn't require any
# special CPU support.
USE_SLICING_BY_8_CRC32C=1
- fi
+ fi
fi
fi
fi
@@ -18249,6 +18256,61 @@ $as_echo "slicing-by-8" >&6; }
fi
+# Select UTF-8 validator implementation.
+#
+# If we are targeting a processor that has SSE 4.2 instructions, we can use
+# those to validate UTF-8 characters. If we're not targeting such
+# a processor, but we can nevertheless produce code that uses the SSE
+# intrinsics, perhaps with some extra CFLAGS, compile both implementations and
+# select which one to use at runtime, depending on whether SSE 4.2 is supported
+# by the processor we're running on.
+#
+# You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
+# in the template or configure command line.
+if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+ if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+ USE_SSE42_UTF8=1
+ else
+ # the CPUID instruction is needed for the runtime check.
+ if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+ USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+ else
+ # fall back to algorithm which doesn't require any special
+ # CPU support.
+ USE_FALLBACK_UTF8=1
+ fi
+ fi
+fi
+
+# Set PG_UTF8_OBJS appropriately depending on the selected implementation.
+# Note: We need the fallback for error handling in all builds.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking which UTF-8 validator to use" >&5
+$as_echo_n "checking which UTF-8 validator to use... " >&6; }
+if test x"$USE_SSE42_UTF8" = x"1"; then
+
+$as_echo "#define USE_SSE42_UTF8 1" >>confdefs.h
+
+ PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
+$as_echo "SSE 4.2" >&6; }
+else
+ if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+
+$as_echo "#define USE_SSE42_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+ PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
+$as_echo "SSE 4.2 with runtime check" >&6; }
+ else
+
+$as_echo "#define USE_FALLBACK_UTF8 1" >>confdefs.h
+
+ PG_UTF8_OBJS="pg_utf8_fallback.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: fallback" >&5
+$as_echo "fallback" >&6; }
+ fi
+fi
+
# Select semaphore implementation type.
if test "$PORTNAME" != "win32"; then
diff --git a/configure.ac b/configure.ac
index 92193f35fb..a67f797c98 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2059,14 +2059,14 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
fi
-# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
+# Check for Intel SSE 4.2 intrinsics.
#
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
+# First check if these intrinsics can be used
# with the default compiler flags. If not, check if adding the -msse4.2
# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
-PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
- PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
+PGAC_SSE42_INTRINSICS([])
+if test x"$pgac_sse42_intrinsics" != x"yes"; then
+ PGAC_SSE42_INTRINSICS([-msse4.2])
fi
AC_SUBST(CFLAGS_SSE42)
@@ -2107,12 +2107,12 @@ AC_SUBST(CFLAGS_ARMV8_CRC32C)
# in the template or configure command line.
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
# Use Intel SSE 4.2 if available.
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+ if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
USE_SSE42_CRC32C=1
else
# Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
# the runtime check.
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+ if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
else
# Use ARM CRC Extension if available.
@@ -2126,7 +2126,7 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
# fall back to slicing-by-8 algorithm, which doesn't require any
# special CPU support.
USE_SLICING_BY_8_CRC32C=1
- fi
+ fi
fi
fi
fi
@@ -2163,6 +2163,51 @@ else
fi
AC_SUBST(PG_CRC32C_OBJS)
+# Select UTF-8 validator implementation.
+#
+# If we are targeting a processor that has SSE 4.2 instructions, we can use
+# those to validate UTF-8 characters. If we're not targeting such
+# a processor, but we can nevertheless produce code that uses the SSE
+# intrinsics, perhaps with some extra CFLAGS, compile both implementations and
+# select which one to use at runtime, depending on whether SSE 4.2 is supported
+# by the processor we're running on.
+#
+# You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
+# in the template or configure command line.
+if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+ if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+ USE_SSE42_UTF8=1
+ else
+ # the CPUID instruction is needed for the runtime check.
+ if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+ USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+ else
+ # fall back to algorithm which doesn't require any special
+ # CPU support.
+ USE_FALLBACK_UTF8=1
+ fi
+ fi
+fi
+
+# Set PG_UTF8_OBJS appropriately depending on the selected implementation.
+# Note: We need the fallback for error handling in all builds.
+AC_MSG_CHECKING([which UTF-8 validator to use])
+if test x"$USE_SSE42_UTF8" = x"1"; then
+ AC_DEFINE(USE_SSE42_UTF8, 1, [Define to 1 to use the UTF-8 validator written with Intel SSE instructions.])
+ PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+ AC_MSG_RESULT(SSE 4.2)
+else
+ if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+ AC_DEFINE(USE_SSE42_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use the UTF-8 validator written with Intel SSE instructions with runtime check.])
+ PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+ AC_MSG_RESULT(SSE 4.2 with runtime check)
+ else
+ AC_DEFINE(USE_FALLBACK_UTF8, 1, [Define to 1 to use the fallback UTF-8 validator written in C.])
+ PG_UTF8_OBJS="pg_utf8_fallback.o"
+ AC_MSG_RESULT(fallback)
+ fi
+fi
+AC_SUBST(PG_UTF8_OBJS)
# Select semaphore implementation type.
if test "$PORTNAME" != "win32"; then
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 74b3a6acd2..1d51ebe9c6 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -721,6 +721,9 @@ LIBOBJS = @LIBOBJS@
# files needed for the chosen CRC-32C implementation
PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
+# files needed for the chosen UTF-8 validation implementation
+PG_UTF8_OBJS = @PG_UTF8_OBJS@
+
LIBS := -lpgcommon -lpgport $(LIBS)
# to make ws2_32.lib the last library
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 6e7d731e02..37c4d4489b 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -13,6 +13,7 @@
#include "c.h"
#include "mb/pg_wchar.h"
+#include "port/pg_utf8.h"
/*
@@ -1760,30 +1761,8 @@ pg_utf8_verifychar(const unsigned char *s, int len)
static int
pg_utf8_verifystr(const unsigned char *s, int len)
{
- const unsigned char *start = s;
-
- while (len > 0)
- {
- int l;
-
- /* fast path for ASCII-subset characters */
- if (!IS_HIGHBIT_SET(*s))
- {
- if (*s == '\0')
- break;
- l = 1;
- }
- else
- {
- l = pg_utf8_verifychar(s, len);
- if (l == -1)
- break;
- }
- s += l;
- len -= l;
- }
-
- return s - start;
+ /* platform-specific implementation in src/port */
+ return UTF8_VERIFYSTR(s, len);
}
/*
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 5e2255a2f5..8d5f9114ab 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -920,6 +920,15 @@
/* Define to 1 to build with PAM support. (--with-pam) */
#undef USE_PAM
+/* Define to 1 to use the fallback UTF-8 validator written in C. */
+#undef USE_FALLBACK_UTF8
+
+/* Define to 1 to use the UTF-8 validator written with Intel SSE instructions. */
+#undef USE_SSE42_UTF8
+
+/* Define to 1 to use the UTF-8 validator written with Intel SSE instructions with runtime check. */
+#undef USE_SSE42_UTF8_WITH_RUNTIME_CHECK
+
/* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
#undef USE_SLICING_BY_8_CRC32C
diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h
new file mode 100644
index 0000000000..89132243b0
--- /dev/null
+++ b/src/include/port/pg_utf8.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8.h
+ * Routines for fast validation of UTF-8 text.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/pg_utf8.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_UTF8_H
+#define PG_UTF8_H
+
+
+#if defined(USE_SSE42_UTF8)
+/* Use Intel SSE4.2 instructions. */
+#define UTF8_VERIFYSTR(s, len) \
+ pg_validate_utf8_sse42((s), (len))
+
+extern int pg_validate_utf8_sse42(const unsigned char *s, int len);
+
+#elif defined(USE_SSE42_UTF8_WITH_RUNTIME_CHECK)
+/*
+ * Use Intel SSE 4.2 instructions, but perform a runtime check first
+ * to check that they are available.
+ */
+#define UTF8_VERIFYSTR(s, len) \
+ pg_validate_utf8((s), (len))
+
+extern int (*pg_validate_utf8) (const unsigned char *s, int len);
+extern int pg_validate_utf8_sse42(const unsigned char *s, int len);
+
+#else
+#define UTF8_VERIFYSTR(s, len) \
+ pg_validate_utf8_fallback((s), (len))
+
+#endif /* USE_SSE42_UTF8 */
+
+/* The following need to be visible everywhere. */
+
+extern int pg_validate_utf8_fallback(const unsigned char *s, int len);
+
+#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80)
+#define IS_TWO_BYTE_LEAD(c) (((c) & 0xE0) == 0xC0)
+#define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0)
+#define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0)
+
+/* from https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord */
+#define HAS_ZERO(chunk) ( \
+ ((chunk) - UINT64CONST(0x0101010101010101)) & \
+ ~(chunk) & \
+ UINT64CONST(0x8080808080808080))
+
+/* Return 2 * sizeof(uint64) if that many leading bytes are all non-zero ASCII, else 0. */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+ uint64 half1,
+ half2,
+ highbits_set;
+
+ if (len >= (int) (2 * sizeof(uint64)))
+ {
+ memcpy(&half1, s, sizeof(uint64));
+ memcpy(&half2, s + sizeof(uint64), sizeof(uint64));
+
+ /* If there are zero bytes, bail and let the slow path handle it. */
+ if (HAS_ZERO(half1) || HAS_ZERO(half2))
+ return 0;
+
+ /* Check if any bytes in this chunk have the high bit set. */
+ highbits_set = ((half1 | half2) & UINT64CONST(0x8080808080808080));
+
+ if (!highbits_set)
+ return 2 * sizeof(uint64);
+ else
+ return 0;
+ }
+ else
+ return 0;
+}
+
+#endif /* PG_UTF8_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index 52dbf5783f..04838b0ab2 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -40,6 +40,7 @@ LIBS += $(PTHREAD_LIBS)
OBJS = \
$(LIBOBJS) \
$(PG_CRC32C_OBJS) \
+ $(PG_UTF8_OBJS) \
bsearch_arg.o \
chklocale.o \
erand48.o \
@@ -89,6 +90,11 @@ libpgport.a: $(OBJS)
thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS)
+# all versions of pg_utf8_sse42.o need CFLAGS_SSE42
+pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+
# all versions of pg_crc32c_sse42.o need CFLAGS_SSE42
pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
diff --git a/src/port/pg_utf8_fallback.c b/src/port/pg_utf8_fallback.c
new file mode 100644
index 0000000000..1efedc2429
--- /dev/null
+++ b/src/port/pg_utf8_fallback.c
@@ -0,0 +1,129 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_fallback.c
+ * Validate UTF-8 using plain C.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_utf8_fallback.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include "port/pg_utf8.h"
+
+
+/*
+ * See the comment in common/wchar.c under "multibyte sequence validators".
+ */
+int
+pg_validate_utf8_fallback(const unsigned char *s, int len)
+{
+ const unsigned char *start = s;
+ unsigned char b1,
+ b2,
+ b3,
+ b4;
+
+ while (len > 0)
+ {
+ int l;
+
+ /* fast path: skip a whole chunk at once if it is all non-zero ASCII */
+ l = check_ascii(s, len);
+ if (l)
+ {
+ s += l;
+ len -= l;
+ continue;
+ }
+
+ /* Not a full chunk of non-zero ASCII (or too few bytes left), so verify a single character. */
+ if (!IS_HIGHBIT_SET(*s))
+ {
+ if (*s == '\0')
+ break;
+ l = 1;
+ }
+ /* code points U+0080 through U+07FF */
+ else if (IS_TWO_BYTE_LEAD(*s))
+ {
+ l = 2;
+ if (len < l)
+ break;
+
+ b1 = *s;
+ b2 = *(s + 1);
+
+ if (!IS_CONTINUATION_BYTE(b2))
+ break;
+
+ /* check 2-byte overlong: 1100.000x 10xx.xxxx */
+ if (b1 < 0xC2)
+ break;
+ }
+ /* code points U+0800 through U+D7FF and U+E000 through U+FFFF */
+ else if (IS_THREE_BYTE_LEAD(*s))
+ {
+ l = 3;
+ if (len < l)
+ break;
+
+ b1 = *s;
+ b2 = *(s + 1);
+ b3 = *(s + 2);
+
+ if (!IS_CONTINUATION_BYTE(b2) ||
+ !IS_CONTINUATION_BYTE(b3))
+ break;
+
+ /* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */
+ if (b1 == 0xE0 && b2 < 0xA0)
+ break;
+
+ /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
+ if (b1 == 0xED && b2 > 0x9F)
+ break;
+ }
+ /* code points U+010000 through U+10FFFF */
+ else if (IS_FOUR_BYTE_LEAD(*s))
+ {
+ l = 4;
+ if (len < l)
+ break;
+
+ b1 = *s;
+ b2 = *(s + 1);
+ b3 = *(s + 2);
+ b4 = *(s + 3);
+
+ if (!IS_CONTINUATION_BYTE(b2) ||
+ !IS_CONTINUATION_BYTE(b3) ||
+ !IS_CONTINUATION_BYTE(b4))
+ break;
+
+ /*
+ * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
+ */
+ if (b1 == 0xF0 && b2 < 0x90)
+ break;
+
+ /* check too large: 1111.0100 with b2 above 1000.1111, or any lead above 1111.0100 */
+ if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4)
+ break;
+ }
+ else
+ /* invalid byte */
+ break;
+
+ s += l;
+ len -= l;
+ }
+
+ /* number of bytes verified before the first invalid byte, zero byte, or end */
+ return s - start;
+}
diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_sse42.c
new file mode 100644
index 0000000000..fc7596940a
--- /dev/null
+++ b/src/port/pg_utf8_sse42.c
@@ -0,0 +1,537 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_sse42.c
+ * Validate UTF-8 using Intel SSE 4.2 instructions.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_utf8_sse42.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include <nmmintrin.h>
+
+#include "port/pg_utf8.h"
+
+/*
+ * This module is based on the paper "Validating UTF-8 In Less Than One
+ * Instruction Per Byte" by John Keiser and Daniel Lemire, arXiv:2010.03090
+ * [cs.DB], 10 Oct 2020.
+ *
+ * The authors provide an implementation of this algorithm
+ * in the simdjson library (Apache 2.0 license) found at
+ * https://github.com/simdjson/simdjson. Even if it were practical to
+ * use this library directly, we cannot because it simply returns valid
+ * or not valid, and we need to return the number of valid bytes found
+ * before the first invalid one.
+ *
+ * Therefore, the PG code was written from scratch, but with some idioms
+ * and naming conventions adapted from the Westmere implementation of
+ * simdjson. The constants and lookup tables were taken directly from
+ * simdjson with some cosmetic rearrangements.
+ *
+ * The core of the lookup algorithm is a two-part process:
+ *
+ * 1. Classify 2-byte sequences. All 2-byte errors can be found by looking
+ * at the first three nibbles of each overlapping 2-byte sequence,
+ * using three separate lookup tables. The interesting bytes are either
+ * definite errors or two continuation bytes in a row. The latter may
+ * be valid depending on what came before.
+ *
+ * 2. Find starts of possible 3- and 4-byte sequences.
+ *
+ * Combining the above results allows us to verify any UTF-8 sequence.
+ */
+
+
+/* constants for comparing bytes */
+#define MAX_CONTINUATION 0xBF
+#define MAX_TWO_BYTE_LEAD 0xDF
+#define MAX_THREE_BYTE_LEAD 0xEF
+
+/* lookup tables for classifying two-byte sequences */
+
+/*
+ * 11______ 0_______
+ * 11______ 11______
+ */
+#define TOO_SHORT (1 << 0)
+
+/* 0_______ 10______ */
+#define TOO_LONG (1 << 1)
+
+/* 1100000_ 10______ */
+#define OVERLONG_2 (1 << 2)
+
+/* 11100000 100_____ */
+#define OVERLONG_3 (1 << 3)
+
+/* The following two symbols intentionally share the same value. */
+
+/* 11110000 1000____ */
+#define OVERLONG_4 (1 << 4)
+
+/*
+ * 11110101 1000____
+ * 1111011_ 1000____
+ * 11111___ 1000____
+ */
+#define TOO_LARGE_1000 (1 << 4)
+
+/*
+ * 11110100 1001____
+ * 11110100 101_____
+ * 11110101 1001____
+ * 11110101 101_____
+ * 1111011_ 1001____
+ * 1111011_ 101_____
+ * 11111___ 1001____
+ * 11111___ 101_____
+ */
+#define TOO_LARGE (1 << 5)
+
+/* 11101101 101_____ */
+#define SURROGATE (1 << 6)
+
+/*
+ * 10______ 10______
+ *
+ * The cast here is to silence warnings about implicit conversion
+ * from 'int' to 'char'. It's fine that this is a negative value,
+ * because we only care about the pattern of bits.
+ */
+#define TWO_CONTS ((char) (1 << 7))
+
+/* These all have ____ in the low nibble of byte 1 */
+#define CARRY (TOO_SHORT | TOO_LONG | TWO_CONTS)
+
+/*
+ * table for categorizing bits in the high nibble of
+ * the first byte of a 2-byte sequence
+ */
+#define BYTE_1_HIGH_TABLE \
+ /* 0_______ ________ <ASCII in byte 1> */ \
+ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, \
+ TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, \
+ /* 10______ ________ <continuation in byte 1> */ \
+ TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, \
+ /* 1100____ ________ <two byte lead in byte 1> */ \
+ TOO_SHORT | OVERLONG_2, \
+ /* 1101____ ________ <two byte lead in byte 1> */ \
+ TOO_SHORT, \
+ /* 1110____ ________ <three byte lead in byte 1> */ \
+ TOO_SHORT | OVERLONG_3 | SURROGATE, \
+ /* 1111____ ________ <four+ byte lead in byte 1> */ \
+ TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+
+/*
+ * table for categorizing bits in the low nibble of
+ * the first byte of a 2-byte sequence
+ */
+#define BYTE_1_LOW_TABLE \
+ /* ____0000 ________ */ \
+ CARRY | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, \
+ /* ____0001 ________ */ \
+ CARRY | OVERLONG_2, \
+ /* ____001_ ________ */ \
+ CARRY, \
+ CARRY, \
+ /* ____0100 ________ */ \
+ CARRY | TOO_LARGE, \
+ /* ____0101 ________ */ \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ /* ____011_ ________ */ \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ /* ____1___ ________ */ \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ /* ____1101 ________ */ \
+ CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, \
+ CARRY | TOO_LARGE | TOO_LARGE_1000, \
+ CARRY | TOO_LARGE | TOO_LARGE_1000
+
+/*
+ * table for categorizing bits in the high nibble of
+ * the second byte of a 2-byte sequence
+ */
+#define BYTE_2_HIGH_TABLE \
+ /* ________ 0_______ <ASCII in byte 2> */ \
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, \
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, \
+ /* ________ 1000____ */ \
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, \
+ /* ________ 1001____ */ \
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, \
+ /* ________ 101_____ */ \
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, \
+ TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, \
+ /* ________ 11______ */ \
+ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+
+
+/* helper functions to wrap intrinsics */
+
+#define vset(...) _mm_setr_epi8(__VA_ARGS__)
+
+/* return a register with all bits zero */
+static inline __m128i
+vzero(void)
+{
+ return _mm_setzero_si128();
+}
+
+/* perform an unaligned load from memory into a register */
+static inline __m128i
+vload(const unsigned char *raw_input)
+{
+ return _mm_loadu_si128((const __m128i *) raw_input);
+}
+
+/* return a vector with each 8-bit lane populated with the input scalar */
+static inline __m128i
+splat(char byte)
+{
+ return _mm_set1_epi8(byte);
+}
+
+/* perform signed greater-than on all 8-bit lanes */
+static inline __m128i
+greater_than(const __m128i v1, const __m128i v2)
+{
+ return _mm_cmpgt_epi8(v1, v2);
+}
+
+/* bitwise vector operations */
+static inline __m128i
+bitwise_and(const __m128i v1, const __m128i v2)
+{
+ return _mm_and_si128(v1, v2);
+}
+
+static inline __m128i
+bitwise_or(const __m128i v1, const __m128i v2)
+{
+ return _mm_or_si128(v1, v2);
+}
+
+static inline __m128i
+bitwise_xor(const __m128i v1, const __m128i v2)
+{
+ return _mm_xor_si128(v1, v2);
+}
+
+/*
+ * Do unsigned subtraction, but instead of wrapping around
+ * on overflow, stop at zero. Useful for emulating unsigned
+ * comparison.
+ */
+static inline __m128i
+saturating_sub(const __m128i v1, const __m128i v2)
+{
+ return _mm_subs_epu8(v1, v2);
+}
+
+/*
+ * Shift right each 8-bit lane
+ *
+ * There is no intrinsic to do this on 8-bit lanes, so shift right in each
+ * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
+ */
+static inline __m128i
+shift_right(const __m128i v, const int n)
+{
+ const __m128i shift16 = _mm_srli_epi16(v, n);
+ const __m128i mask = splat(0xFF >> n);
+
+ return bitwise_and(shift16, mask);
+}
+
+/*
+ * prevN(): move the contents of 'input' up by N 8-bit lanes and fill the
+ * N vacated lanes at the front with the last N lanes of 'prev'.  Viewing
+ * the two registers as consecutive runs of bytes, this could be stated in
+ * C thusly:
+ *
+ * ((prev << 128) | input) >> (N * 8)
+ *
+ * The intrinsic insists on a numeric constant for its third argument, so
+ * each shift amount gets a function of its own.  This one is N = 1.
+ */
+static inline __m128i
+prev1(__m128i prev, __m128i input)
+{
+	const __m128i shifted = _mm_alignr_epi8(input, prev, sizeof(__m128i) - 1);
+
+	return shifted;
+}
+
+/* as prev1(), but bring in the last two lanes of 'prev' */
+static inline __m128i
+prev2(__m128i prev, __m128i input)
+{
+	const __m128i shifted = _mm_alignr_epi8(input, prev, sizeof(__m128i) - 2);
+
+	return shifted;
+}
+
+/* as prev1(), but bring in the last three lanes of 'prev' */
+static inline __m128i
+prev3(__m128i prev, __m128i input)
+{
+	const __m128i shifted = _mm_alignr_epi8(input, prev, sizeof(__m128i) - 3);
+
+	return shifted;
+}
+
+/*
+ * Table lookup: treat 'table' as a 16-element byte array and, for each
+ * 8-bit lane of 'input', fetch the element selected by that lane's value.
+ *
+ * The callers in this file only pass nibbles (values 0..15) as indexes.
+ */
+static inline __m128i
+lookup(const __m128i input, const __m128i table)
+{
+	return _mm_shuffle_epi8(table, input);
+}
+
+/*
+ * Classify each byte of 'input' together with the byte preceding it, and
+ * return a vector whose non-zero lanes mark either an error or two or more
+ * continuation bytes in a row.
+ */
+static inline __m128i
+check_special_cases(const __m128i prev, const __m128i input)
+{
+	/*
+	 * The byte preceding each lane of 'input'.  Lane 0 needs the last byte
+	 * of the previous chunk, which is why 'prev' is passed in.
+	 */
+	const __m128i preceding = prev1(prev, input);
+
+	/* isolate the three nibbles that serve as table indexes */
+	const __m128i first_hi_nibble = shift_right(preceding, 4);
+	const __m128i first_lo_nibble = bitwise_and(preceding, splat(0x0F));
+	const __m128i second_hi_nibble = shift_right(input, 4);
+
+	/* fetch the set of possible errors for each nibble */
+	const __m128i first_hi_errs = lookup(first_hi_nibble, vset(BYTE_1_HIGH_TABLE));
+	const __m128i first_lo_errs = lookup(first_lo_nibble, vset(BYTE_1_LOW_TABLE));
+	const __m128i second_hi_errs = lookup(second_hi_nibble, vset(BYTE_2_HIGH_TABLE));
+
+	/*
+	 * Only bits present in all three lookups survive the AND.  Lanes left
+	 * non-zero stand for one of:
+	 *
+	 * 1. an invalid 2-byte sequence
+	 *
+	 * 2. the second continuation byte of a 3- or 4-byte character
+	 *
+	 * 3. the third continuation byte of a 4-byte character
+	 */
+	return bitwise_and(bitwise_and(first_hi_errs, first_lo_errs),
+					   second_hi_errs);
+}
+
+/*
+ * Mark the lanes where two continuation bytes in a row are expected, which
+ * is legitimate only inside 3- and 4-byte sequences.  Those lanes are set
+ * to TWO_CONTS; every other lane is zero.
+ */
+static inline __m128i
+check_multibyte_lengths(const __m128i prev, const __m128i input)
+{
+	/*
+	 * The bytes two and three positions back from each lane, with the
+	 * leading lanes borrowed from the previous chunk.
+	 */
+	const __m128i two_back = prev2(prev, input);
+	const __m128i three_back = prev3(prev, input);
+
+	/*
+	 * There is no unsigned byte comparison, so subtract with saturation
+	 * instead: a lane stays non-zero only where the byte exceeds the limit.
+	 * Any 3-byte lead is greater than MAX_TWO_BYTE_LEAD, and likewise any
+	 * 4-byte lead is greater than MAX_THREE_BYTE_LEAD.
+	 */
+	const __m128i third_byte_here =
+		saturating_sub(two_back, splat(MAX_TWO_BYTE_LEAD));
+	const __m128i fourth_byte_here =
+		saturating_sub(three_back, splat(MAX_THREE_BYTE_LEAD));
+
+	/*
+	 * Merge the two results and widen every non-zero lane to all ones.  A
+	 * signed comparison with zero is fine because the differences are small.
+	 */
+	const __m128i wants_cont =
+		greater_than(bitwise_or(third_byte_here, fourth_byte_here), vzero());
+
+	/*
+	 * Keep only the TWO_CONTS bit so that the result can be matched against
+	 * what check_special_cases() produces.
+	 */
+	return bitwise_and(wants_cont, splat(TWO_CONTS));
+}
+
+/*
+ * Run the full UTF-8 check on one chunk and accumulate the outcome in
+ * *error, which gains set bits wherever the input is invalid.  'prev' is
+ * the preceding chunk.
+ */
+static inline void
+check_utf8_bytes(const __m128i prev, const __m128i input, __m128i * error)
+{
+	const __m128i expected = check_multibyte_lengths(prev, input);
+	const __m128i found = check_special_cases(prev, input);
+
+	/* XOR leaves nothing behind when what we found is what we expected */
+	*error = bitwise_or(*error, bitwise_xor(expected, found));
+}
+
+/* collapse a register to a boolean: false if all bits are zero, else true */
+static inline bool
+to_bool(const __m128i v)
+{
+	/*
+	 * _mm_testz_si128 yields 1 when the bitwise AND of its arguments is
+	 * zero.  ANDing v with itself gives v back, so that happens only when v
+	 * is zero.
+	 */
+	return _mm_testz_si128(v, v) == 0;
+}
+
+/* set bits in the error vector where bytes in the input are zero */
+static inline void
+check_for_zeros(const __m128i v, __m128i * error)
+{
+ const __m128i cmp = _mm_cmpeq_epi8(v, vzero());
+
+ *error = bitwise_or(*error, cmp);
+}
+
+/*
+ * Vector version of IS_HIGHBIT_SET(): true if any 8-bit lane of v has its
+ * most significant bit set.
+ */
+static inline bool
+is_highbit_set(const __m128i v)
+{
+	const int	top_bits = _mm_movemask_epi8(v);
+
+	return top_bits != 0;
+}
+
+/*
+ * Return non-zero if the chunk terminates with an incomplete code point.
+ *
+ * Only the last three lanes can be flagged: they are compared against
+ * MAX_THREE_BYTE_LEAD, MAX_TWO_BYTE_LEAD and MAX_CONTINUATION respectively,
+ * and a lane holding anything larger survives the saturating subtraction.
+ * The 0xFF limit in all other lanes always subtracts down to zero.
+ */
+static inline __m128i
+is_incomplete(const __m128i v)
+{
+	const __m128i limits =
+		vset(0xFF, 0xFF, 0xFF, 0xFF,
+			 0xFF, 0xFF, 0xFF, 0xFF,
+			 0xFF, 0xFF, 0xFF, 0xFF,
+			 0xFF, MAX_THREE_BYTE_LEAD, MAX_TWO_BYTE_LEAD, MAX_CONTINUATION);
+	const __m128i over_limit = saturating_sub(v, limits);
+
+	return over_limit;
+}
+
+/*
+ * Validate 'len' bytes at 's' as UTF-8, one sizeof(__m128i)-byte chunk at
+ * a time.
+ *
+ * Returns the original 'len' when no error is detected.  Otherwise the
+ * answer is delegated to pg_validate_utf8_fallback(): either for the whole
+ * input, or for the tail only, in which case the length of the prefix that
+ * was skipped is added to its result.  Zero bytes in the input are treated
+ * as errors.
+ *
+ * See the comment in common/wchar.c under "multibyte sequence validators".
+ */
+int
+pg_validate_utf8_sse42(const unsigned char *s, int len)
+{
+ const unsigned char *start = s;
+ const int orig_len = len;
+ __m128i error = vzero();
+ __m128i prev = vzero();
+ __m128i prev_incomplete = vzero();
+ __m128i input;
+
+ while (len > sizeof(__m128i))
+ {
+ input = vload(s);
+
+ check_for_zeros(input, &error);
+
+ /*
+ * If the chunk is all ASCII, we can skip the full UTF-8 check, but we
+ * must still check the previous chunk for incomplete multibyte
+ * sequences at the end. We only update prev_incomplete if the chunk
+ * contains non-ASCII, since the error is cumulative.
+ */
+ if (!is_highbit_set(input))
+ error = bitwise_or(error, prev_incomplete);
+ else
+ {
+ check_utf8_bytes(prev, input, &error);
+ prev_incomplete = is_incomplete(input);
+ }
+
+ prev = input;
+ s += sizeof(__m128i);
+ len -= sizeof(__m128i);
+ }
+
+ /*
+ * If we saw an error any time during the loop, start over with the
+ * fallback so we can return the number of valid bytes.
+ */
+ if (to_bool(error))
+ return pg_validate_utf8_fallback(start, orig_len);
+ else
+ {
+ unsigned char inbuf[sizeof(__m128i)];
+
+ /*
+ * Back-fill the remainder with some kind of ASCII so that we have a
+ * whole register. Normally we memset buffers with zero, but if we did
+ * that, we couldn't reuse our check for zero bytes using vector
+ * operations.
+ */
+ memset(inbuf, 0x20, sizeof(__m128i));
+ memcpy(inbuf, s, len);
+
+ input = vload(inbuf);
+
+ check_for_zeros(input, &error);
+ check_utf8_bytes(prev, input, &error);
+
+ /*
+ * We must also check that the remainder does not end with an
+ * incomplete code point. This would only slip past check_utf8_bytes()
+ * if the remainder is 16 bytes in length, but it's not worth adding a
+ * branch for that.
+ */
+ error = bitwise_or(error, is_incomplete(input));
+
+ if (to_bool(error))
+ {
+ /*
+ * If we encounter errors in the remainder, we need to be a bit
+ * more careful, since it's possible that the end of the input
+ * falls within a multibyte sequence, and we don't want to repeat
+ * the work we've already done. In that case, we just walk
+ * backwards into the previous chunk, if any, to find the last
+ * byte that could have been the start of a character. For short
+ * strings, this will start over from the beginning, but that's
+ * fine.
+ */
+ while (s > start)
+ {
+ s--;
+ len++;
+
+ if ((!IS_HIGHBIT_SET(*s) && *s != '\0') ||
+ IS_TWO_BYTE_LEAD(*s) ||
+ IS_THREE_BYTE_LEAD(*s) ||
+ IS_FOUR_BYTE_LEAD(*s))
+ break;
+ }
+ return orig_len - len + pg_validate_utf8_fallback(s, len);
+ }
+ else
+ return orig_len;
+ }
+}
diff --git a/src/port/pg_utf8_sse42_choose.c b/src/port/pg_utf8_sse42_choose.c
new file mode 100644
index 0000000000..ff6120be2b
--- /dev/null
+++ b/src/port/pg_utf8_sse42_choose.c
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_sse42_choose.c
+ * Choose between Intel SSE 4.2 and fallback implementation.
+ *
+ * On first call, checks if the CPU we're running on supports Intel SSE
+ * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
+ * fall back to the pure C implementation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_utf8_sse42_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+#include "port/pg_utf8.h"
+
+/*
+ * Report whether the CPU we are running on supports SSE 4.2.
+ *
+ * To save from checking every SSE2 intrinsic, insist on 64-bit x86; any
+ * other target reports false.  GCC-compatible compilers announce that
+ * target with __x86_64__, whereas MSVC defines _M_AMD64 instead, so both
+ * must be accepted or the HAVE__CPUID branch below could never be reached.
+ */
+static bool
+pg_utf8_sse42_available(void)
+{
+#if defined(__x86_64__) || defined(_M_AMD64)
+	/* starts zeroed, so we report "no" if the query fills nothing in */
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif							/* HAVE__GET_CPUID */
+
+	/* exx[2] receives ECX; bit 20 of it is the SSE 4.2 feature flag */
+	return (exx[2] & (1 << 20)) != 0;
+
+#else
+	return false;
+#endif							/* __x86_64__ || _M_AMD64 */
+}
+
+/*
+ * Initial target of the pg_validate_utf8 function pointer.  The first call
+ * lands here, picks an implementation based on what the CPU supports and
+ * stores it in the pointer, so that later calls go straight to the chosen
+ * implementation.  The current request is then forwarded to it as well.
+ */
+static int
+pg_validate_utf8_choose(const unsigned char *s, int len)
+{
+	pg_validate_utf8 = pg_utf8_sse42_available() ?
+		pg_validate_utf8_sse42 : pg_validate_utf8_fallback;
+
+	return pg_validate_utf8(s, len);
+}
+
+/* starts out aimed at the chooser, which re-points it on first use */
+int (*pg_validate_utf8) (const unsigned char *s, int len) = pg_validate_utf8_choose;
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index e34ab20974..e37bda8057 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,58 @@ $$;
--
-- UTF-8
--
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text);
+insert into utf8_verification_inputs values
+ ('\xaf', 'bare continuation'),
+ ('\xc5', 'missing second byte in 2-byte char'),
+ ('\xc080', 'smallest 2-byte overlong'),
+ ('\xc1bf', 'largest 2-byte overlong'),
+ ('\xc280', 'next 2-byte after overlongs'),
+ ('\xdfbf', 'largest 2-byte'),
+ ('\xe9af', 'missing third byte in 3-byte char'),
+ ('\xe08080', 'smallest 3-byte overlong'),
+ ('\xe09fbf', 'largest 3-byte overlong'),
+ ('\xe0a080', 'next 3-byte after overlong'),
+ ('\xed9fbf', 'last before surrogates'),
+ ('\xeda080', 'smallest surrogate'),
+ ('\xedbfbf', 'largest surrogate'),
+ ('\xee8080', 'next after surrogates'),
+ ('\xefbfbf', 'largest 3-byte'),
+ ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+ ('\xf0808080', 'smallest 4-byte overlong'),
+ ('\xf08fbfbf', 'largest 4-byte overlong'),
+ ('\xf0908080', 'next 4-byte after overlong'),
+ ('\xf48fbfbf', 'largest 4-byte'),
+ ('\xf4908080', 'smallest too large'),
+ ('\xfa9a9a8a8a', '5 byte');
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+ description | result | errorat | error
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ bare continuation | \x | \xaf | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x | \xc5 | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong | \x | \xc080 | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong | \x | \xc1bf | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs | \xc280 | |
+ largest 2-byte | \xdfbf | |
+ missing third byte in 3-byte char | \x | \xe9af | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong | \x | \xe08080 | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong | \x | \xe09fbf | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong | \xe0a080 | |
+ last before surrogates | \xed9fbf | |
+ smallest surrogate | \x | \xeda080 | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate | \x | \xedbfbf | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates | \xee8080 | |
+ largest 3-byte | \xefbfbf | |
+ missing fourth byte in 4-byte char | \x | \xf1afbf | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong | \x | \xf0808080 | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong | \x | \xf08fbfbf | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong | \xf0908080 | |
+ largest 4-byte | \xf48fbfbf | |
+ smallest too large | \x | \xf4908080 | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5 byte | \x | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+(22 rows)
+
CREATE TABLE utf8_inputs (inbytes bytea, description text);
insert into utf8_inputs values
('\x666f6f', 'valid, pure ASCII'),
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index ea85f20ed8..7f761cd630 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,34 @@ $$;
--
-- UTF-8
--
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text);
+insert into utf8_verification_inputs values
+ ('\xaf', 'bare continuation'),
+ ('\xc5', 'missing second byte in 2-byte char'),
+ ('\xc080', 'smallest 2-byte overlong'),
+ ('\xc1bf', 'largest 2-byte overlong'),
+ ('\xc280', 'next 2-byte after overlongs'),
+ ('\xdfbf', 'largest 2-byte'),
+ ('\xe9af', 'missing third byte in 3-byte char'),
+ ('\xe08080', 'smallest 3-byte overlong'),
+ ('\xe09fbf', 'largest 3-byte overlong'),
+ ('\xe0a080', 'next 3-byte after overlong'),
+ ('\xed9fbf', 'last before surrogates'),
+ ('\xeda080', 'smallest surrogate'),
+ ('\xedbfbf', 'largest surrogate'),
+ ('\xee8080', 'next after surrogates'),
+ ('\xefbfbf', 'largest 3-byte'),
+ ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+ ('\xf0808080', 'smallest 4-byte overlong'),
+ ('\xf08fbfbf', 'largest 4-byte overlong'),
+ ('\xf0908080', 'next 4-byte after overlong'),
+ ('\xf48fbfbf', 'largest 4-byte'),
+ ('\xf4908080', 'smallest too large'),
+ ('\xfa9a9a8a8a', '5 byte');
+
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
CREATE TABLE utf8_inputs (inbytes bytea, description text);
insert into utf8_inputs values
('\x666f6f', 'valid, pure ASCII'),
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index bc65185130..be17fd3ab4 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -114,10 +114,14 @@ sub mkvcbuild
push(@pgportfiles, 'pg_crc32c_sse42_choose.c');
push(@pgportfiles, 'pg_crc32c_sse42.c');
push(@pgportfiles, 'pg_crc32c_sb8.c');
+ push(@pgportfiles, 'pg_utf8_sse42_choose.c');
+ push(@pgportfiles, 'pg_utf8_sse42.c');
+ push(@pgportfiles, 'pg_utf8_fallback.c');
}
else
{
push(@pgportfiles, 'pg_crc32c_sb8.c');
+ push(@pgportfiles, 'pg_utf8_fallback.c');
}
our @pgcommonallfiles = qw(
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index 710f26f8ab..de3c62af7c 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -494,6 +494,9 @@ sub GenerateFiles
USE_NAMED_POSIX_SEMAPHORES => undef,
USE_OPENSSL => undef,
USE_PAM => undef,
+ USE_FALLBACK_UTF8 => undef,
+ USE_SSE42_UTF8 => undef,
+ USE_SSE42_UTF8_WITH_RUNTIME_CHECK => 1,
USE_SLICING_BY_8_CRC32C => undef,
USE_SSE42_CRC32C => undef,
USE_SSE42_CRC32C_WITH_RUNTIME_CHECK => 1,
--
2.22.0