On 07/02/2021 22:24, John Naylor wrote:
Here is a more polished version of the function pointer approach, now adapted to all multibyte encodings. Using the not-yet-committed tests from [1], I found a thinko bug that resulted in the test for nul bytes to not only be wrong, but probably also elided by the compiler. Doing it correctly is noticeably slower on pure ascii, but still several times faster than before, so the conclusions haven't changed any. I'll run full measurements later this week, but I'll share the patch now for review.

As a quick test, I hacked up pg_utf8_verifystr() to use Lemire's algorithm from the simdjson library [1], see attached patch. I microbenchmarked it using the the same test I used before [2].

These results are with "gcc -O2" using "gcc (Debian 10.2.1-6) 10.2.1 20210110"

unpatched master:

postgres=# \i mbverifystr-speed.sql
CREATE FUNCTION
 mixed | ascii
-------+-------
   728 |   393
(1 row)

v1-0001-Add-an-ASCII-fast-path-to-multibyte-encoding-veri.patch:

 mixed | ascii
-------+-------
   759 |    98
(1 row)

simdjson-utf8-hack.patch:

 mixed | ascii
-------+-------
    53 |    31
(1 row)

So clearly that algorithm is fast. Not sure if it has a high startup cost, or large code size, or other tradeoffs that we don't want. At least it depends on SIMD instructions, so it requires more code for the architecture-specific implementations and autoconf logic and all that. Nevertheless I think it deserves a closer look, I'm a bit reluctant to put in half-way measures, when there's a clearly superior algorithm out there.

I also tested the fallback implementation from the simdjson library (included in the patch, if you uncomment it in simdjson-glue.c):

 mixed | ascii
-------+-------
   447 |    46
(1 row)

I think we should at least try to adopt that. At a high level, it looks pretty similar your patch: you load the data 8 bytes at a time, check if there are all ASCII. If there are any non-ASCII chars, you check the bytes one by one, otherwise you load the next 8 bytes. Your patch should be able to achieve the same performance, if done right. I don't think the simdjson code forbids \0 bytes, so that will add a few cycles, but still.

[1] https://github.com/simdjson/simdjson
[2] https://www.postgresql.org/message-id/06d45421-61b8-86dd-e765-f1ce527a5...@iki.fi

- Heikki

PS. Your patch as it stands isn't safe on systems with strict alignment, the string passed to the verify function isn't guaranteed to be 8 bytes aligned. Use memcpy to fetch the next 8-byte chunk to fix.

diff --git a/src/backend/Makefile b/src/backend/Makefile
index 9672e2cb43a..aa79526b0cf 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -54,6 +54,9 @@ ifeq ($(with_systemd),yes)
 LIBS += -lsystemd
 endif
 
+LIBS += -lsimdjson
+
+
 ##########################################################################
 
 all: submake-libpgport submake-catalog-headers submake-utils-headers postgres $(POSTGRES_IMP)
@@ -63,7 +66,7 @@ ifneq ($(PORTNAME), win32)
 ifneq ($(PORTNAME), aix)
 
 postgres: $(OBJS)
-	$(CC) $(CFLAGS) $(call expand_subsys,$^) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(LIBS) -o $@
+	$(CXX) $(CFLAGS) $(call expand_subsys,$^) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(LIBS) -o $@
 
 endif
 endif
diff --git a/src/common/Makefile b/src/common/Makefile
index 5422579a6a2..f3e4d985062 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -92,6 +92,9 @@ OBJS_COMMON += \
 	sha2.o
 endif
 
+simdjson-glue_srv.o:
+	$(CXX) $(CXXFLAGS) -c simdjson-glue.c -o simdjson-glue_srv.o
+
 # A few files are currently only built for frontend, not server
 # (Mkvcbuild.pm has a copy of this list, too).  logging.c is excluded
 # from OBJS_FRONTEND_SHLIB (shared library) as a matter of policy,
@@ -110,6 +113,9 @@ OBJS_FRONTEND = \
 OBJS_SHLIB = $(OBJS_FRONTEND_SHLIB:%.o=%_shlib.o)
 OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
 
+# Only use the simdjson version in the server
+OBJS_SRV += simdjson-glue_srv.o
+
 # where to find gen_keywordlist.pl and subsidiary files
 TOOLSDIR = $(top_srcdir)/src/tools
 GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
diff --git a/src/common/simdjson-glue.c b/src/common/simdjson-glue.c
new file mode 100644
index 00000000000..9c300bde022
--- /dev/null
+++ b/src/common/simdjson-glue.c
@@ -0,0 +1,90 @@
+#include <simdjson.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+// Heikki: This is copy-pasted from simdjson library
+// src/fallback/dom_parser_implementation.cpp
+//
+// Note: Apache licensed!
+
+// credit: based on code from Google Fuchsia (Apache Licensed)
+static bool fallback_validate_utf8(const char *buf, size_t len)
+{
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  uint64_t pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // check of the next 8 bytes are ascii.
+    uint64_t next_pos = pos + 16;
+    if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v1;
+      memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
+        pos = next_pos;
+        continue;
+      }
+    }
+    unsigned char byte = data[pos];
+    if (byte < 0b10000000) {
+      pos++;
+      continue;
+    } else if ((byte & 0b11100000) == 0b11000000) {
+      next_pos = pos + 2;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
+    } else if ((byte & 0b11110000) == 0b11100000) {
+      next_pos = pos + 3;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return false;
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+      next_pos = pos + 4;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
+    } else {
+      // we may have a continuation
+      return false;
+    }
+    pos = next_pos;
+  }
+  return true;
+}
+
+
+extern "C" {
+
+bool
+simdjson_validate_utf8(const char *s, int len)
+{
+
+	// uncomment to test simdjson's fallback implementation
+	//return fallback_validate_utf8(s, len);
+
+	return simdjson::validate_utf8(s, len);
+
+}
+
+}
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 6e7d731e020..0c77f83ba01 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1758,7 +1758,7 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 }
 
 static int
-pg_utf8_verifystr(const unsigned char *s, int len)
+pg_utf8_verifystr_slow(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
 
@@ -1786,6 +1786,23 @@ pg_utf8_verifystr(const unsigned char *s, int len)
 	return s - start;
 }
 
+
+
+#ifndef FRONTEND
+extern bool simdjson_validate_utf8(const char *s, int len);
+#endif
+
+static int
+pg_utf8_verifystr(const unsigned char *s, int len)
+{
+#ifndef FRONTEND
+	if (true && simdjson_validate_utf8((const char *) s, len))
+		return len;
+	else
+#endif
+		return pg_utf8_verifystr_slow(s, len);
+}
+
 /*
  * Check for validity of a single UTF-8 encoded character
  *

Reply via email to