On 07/02/2021 22:24, John Naylor wrote:
Here is a more polished version of the function pointer approach, now
adapted to all multibyte encodings. Using the not-yet-committed tests
from [1], I found a thinko bug that resulted in the test for nul bytes
to not only be wrong, but probably also elided by the compiler. Doing it
correctly is noticeably slower on pure ascii, but still several times
faster than before, so the conclusions haven't changed any. I'll run
full measurements later this week, but I'll share the patch now for review.
As a quick test, I hacked up pg_utf8_verifystr() to use Lemire's
algorithm from the simdjson library [1], see attached patch. I
microbenchmarked it using the the same test I used before [2].
These results are with "gcc -O2" using "gcc (Debian 10.2.1-6) 10.2.1
20210110"
unpatched master:
postgres=# \i mbverifystr-speed.sql
CREATE FUNCTION
mixed | ascii
-------+-------
728 | 393
(1 row)
v1-0001-Add-an-ASCII-fast-path-to-multibyte-encoding-veri.patch:
mixed | ascii
-------+-------
759 | 98
(1 row)
simdjson-utf8-hack.patch:
mixed | ascii
-------+-------
53 | 31
(1 row)
So clearly that algorithm is fast. Not sure if it has a high startup
cost, or large code size, or other tradeoffs that we don't want. At
least it depends on SIMD instructions, so it requires more code for the
architecture-specific implementations and autoconf logic and all that.
Nevertheless I think it deserves a closer look, I'm a bit reluctant to
put in half-way measures, when there's a clearly superior algorithm out
there.
I also tested the fallback implementation from the simdjson library
(included in the patch, if you uncomment it in simdjson-glue.c):
mixed | ascii
-------+-------
447 | 46
(1 row)
I think we should at least try to adopt that. At a high level, it looks
pretty similar your patch: you load the data 8 bytes at a time, check if
there are all ASCII. If there are any non-ASCII chars, you check the
bytes one by one, otherwise you load the next 8 bytes. Your patch should
be able to achieve the same performance, if done right. I don't think
the simdjson code forbids \0 bytes, so that will add a few cycles, but
still.
[1] https://github.com/simdjson/simdjson
[2]
https://www.postgresql.org/message-id/06d45421-61b8-86dd-e765-f1ce527a5...@iki.fi
- Heikki
PS. Your patch as it stands isn't safe on systems with strict alignment,
the string passed to the verify function isn't guaranteed to be 8 bytes
aligned. Use memcpy to fetch the next 8-byte chunk to fix.
diff --git a/src/backend/Makefile b/src/backend/Makefile
index 9672e2cb43a..aa79526b0cf 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -54,6 +54,9 @@ ifeq ($(with_systemd),yes)
LIBS += -lsystemd
endif
+LIBS += -lsimdjson
+
+
##########################################################################
all: submake-libpgport submake-catalog-headers submake-utils-headers postgres $(POSTGRES_IMP)
@@ -63,7 +66,7 @@ ifneq ($(PORTNAME), win32)
ifneq ($(PORTNAME), aix)
postgres: $(OBJS)
- $(CC) $(CFLAGS) $(call expand_subsys,$^) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(LIBS) -o $@
+ $(CXX) $(CFLAGS) $(call expand_subsys,$^) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(LIBS) -o $@
endif
endif
diff --git a/src/common/Makefile b/src/common/Makefile
index 5422579a6a2..f3e4d985062 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -92,6 +92,9 @@ OBJS_COMMON += \
sha2.o
endif
+simdjson-glue_srv.o:
+ $(CXX) $(CXXFLAGS) -c simdjson-glue.c -o simdjson-glue_srv.o
+
# A few files are currently only built for frontend, not server
# (Mkvcbuild.pm has a copy of this list, too). logging.c is excluded
# from OBJS_FRONTEND_SHLIB (shared library) as a matter of policy,
@@ -110,6 +113,9 @@ OBJS_FRONTEND = \
OBJS_SHLIB = $(OBJS_FRONTEND_SHLIB:%.o=%_shlib.o)
OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
+# Only use the simdjson version in the server
+OBJS_SRV += simdjson-glue_srv.o
+
# where to find gen_keywordlist.pl and subsidiary files
TOOLSDIR = $(top_srcdir)/src/tools
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
diff --git a/src/common/simdjson-glue.c b/src/common/simdjson-glue.c
new file mode 100644
index 00000000000..9c300bde022
--- /dev/null
+++ b/src/common/simdjson-glue.c
@@ -0,0 +1,90 @@
+#include <simdjson.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+// Heikki: This is copy-pasted from simdjson library
+// src/fallback/dom_parser_implementation.cpp
+//
+// Note: Apache licensed!
+
+// credit: based on code from Google Fuchsia (Apache Licensed)
+static bool fallback_validate_utf8(const char *buf, size_t len)
+{
+ const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+ uint64_t pos = 0;
+ uint32_t code_point = 0;
+ while (pos < len) {
+ // check of the next 8 bytes are ascii.
+ uint64_t next_pos = pos + 16;
+ if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+ uint64_t v1;
+ memcpy(&v1, data + pos, sizeof(uint64_t));
+ uint64_t v2;
+ memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+ uint64_t v{v1 | v2};
+ if ((v & 0x8080808080808080) == 0) {
+ pos = next_pos;
+ continue;
+ }
+ }
+ unsigned char byte = data[pos];
+ if (byte < 0b10000000) {
+ pos++;
+ continue;
+ } else if ((byte & 0b11100000) == 0b11000000) {
+ next_pos = pos + 2;
+ if (next_pos > len) { return false; }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+ // range check
+ code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+ if (code_point < 0x80 || 0x7ff < code_point) { return false; }
+ } else if ((byte & 0b11110000) == 0b11100000) {
+ next_pos = pos + 3;
+ if (next_pos > len) { return false; }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+ // range check
+ code_point = (byte & 0b00001111) << 12 |
+ (data[pos + 1] & 0b00111111) << 6 |
+ (data[pos + 2] & 0b00111111);
+ if (code_point < 0x800 || 0xffff < code_point ||
+ (0xd7ff < code_point && code_point < 0xe000)) {
+ return false;
+ }
+ } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+ next_pos = pos + 4;
+ if (next_pos > len) { return false; }
+ if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+ if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+ if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
+ // range check
+ code_point =
+ (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+ (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+ if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
+ } else {
+ // we may have a continuation
+ return false;
+ }
+ pos = next_pos;
+ }
+ return true;
+}
+
+
+extern "C" {
+
+bool
+simdjson_validate_utf8(const char *s, int len)
+{
+
+ // uncomment to test simdjson's fallback implementation
+ //return fallback_validate_utf8(s, len);
+
+ return simdjson::validate_utf8(s, len);
+
+}
+
+}
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 6e7d731e020..0c77f83ba01 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1758,7 +1758,7 @@ pg_utf8_verifychar(const unsigned char *s, int len)
}
static int
-pg_utf8_verifystr(const unsigned char *s, int len)
+pg_utf8_verifystr_slow(const unsigned char *s, int len)
{
const unsigned char *start = s;
@@ -1786,6 +1786,23 @@ pg_utf8_verifystr(const unsigned char *s, int len)
return s - start;
}
+
+
+#ifndef FRONTEND
+extern bool simdjson_validate_utf8(const char *s, int len);
+#endif
+
+static int
+pg_utf8_verifystr(const unsigned char *s, int len)
+{
+#ifndef FRONTEND
+ if (true && simdjson_validate_utf8((const char *) s, len))
+ return len;
+ else
+#endif
+ return pg_utf8_verifystr_slow(s, len);
+}
+
/*
* Check for validity of a single UTF-8 encoded character
*