Re: [POC] verifying UTF-8 using SIMD instructions

Heikki Linnakangas Mon, 08 Feb 2021 02:17:30 -0800

On 07/02/2021 22:24, John Naylor wrote:

Here is a more polished version of the function pointer approach, nowadapted to all multibyte encodings. Using the not-yet-committed testsfrom [1], I found a thinko bug that resulted in the test for nul bytesto not only be wrong, but probably also elided by the compiler. Doing itcorrectly is noticeably slower on pure ascii, but still several timesfaster than before, so the conclusions haven't changed any. I'll runfull measurements later this week, but I'll share the patch now for review.

As a quick test, I hacked up pg_utf8_verifystr() to use Lemire'salgorithm from the simdjson library [1], see attached patch. Imicrobenchmarked it using the the same test I used before [2].

These results are with "gcc -O2" using "gcc (Debian 10.2.1-6) 10.2.120210110"


unpatched master:

postgres=# \i mbverifystr-speed.sql
CREATE FUNCTION
 mixed | ascii
-------+-------
   728 |   393
(1 row)

v1-0001-Add-an-ASCII-fast-path-to-multibyte-encoding-veri.patch:

 mixed | ascii
-------+-------
   759 |    98
(1 row)

simdjson-utf8-hack.patch:

 mixed | ascii
-------+-------
    53 |    31
(1 row)

So clearly that algorithm is fast. Not sure if it has a high startupcost, or large code size, or other tradeoffs that we don't want. Atleast it depends on SIMD instructions, so it requires more code for thearchitecture-specific implementations and autoconf logic and all that.Nevertheless I think it deserves a closer look, I'm a bit reluctant toput in half-way measures, when there's a clearly superior algorithm outthere.

I also tested the fallback implementation from the simdjson library(included in the patch, if you uncomment it in simdjson-glue.c):


 mixed | ascii
-------+-------
   447 |    46
(1 row)

I think we should at least try to adopt that. At a high level, it lookspretty similar your patch: you load the data 8 bytes at a time, check ifthere are all ASCII. If there are any non-ASCII chars, you check thebytes one by one, otherwise you load the next 8 bytes. Your patch shouldbe able to achieve the same performance, if done right. I don't thinkthe simdjson code forbids \0 bytes, so that will add a few cycles, butstill.


[1] https://github.com/simdjson/simdjson

[2]https://www.postgresql.org/message-id/[email protected]


- Heikki

PS. Your patch as it stands isn't safe on systems with strict alignment,the string passed to the verify function isn't guaranteed to be 8 bytesaligned. Use memcpy to fetch the next 8-byte chunk to fix.

diff --git a/src/backend/Makefile b/src/backend/Makefile
index 9672e2cb43a..aa79526b0cf 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -54,6 +54,9 @@ ifeq ($(with_systemd),yes)
 LIBS += -lsystemd
 endif
 
+LIBS += -lsimdjson
+
+
 ##########################################################################
 
 all: submake-libpgport submake-catalog-headers submake-utils-headers postgres $(POSTGRES_IMP)
@@ -63,7 +66,7 @@ ifneq ($(PORTNAME), win32)
 ifneq ($(PORTNAME), aix)
 
 postgres: $(OBJS)
-	$(CC) $(CFLAGS) $(call expand_subsys,$^) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(LIBS) -o $@
+	$(CXX) $(CFLAGS) $(call expand_subsys,$^) $(LDFLAGS) $(LDFLAGS_EX) $(export_dynamic) $(LIBS) -o $@
 
 endif
 endif
diff --git a/src/common/Makefile b/src/common/Makefile
index 5422579a6a2..f3e4d985062 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -92,6 +92,9 @@ OBJS_COMMON += \
 	sha2.o
 endif
 
+simdjson-glue_srv.o:
+	$(CXX) $(CXXFLAGS) -c simdjson-glue.c -o simdjson-glue_srv.o
+
 # A few files are currently only built for frontend, not server
 # (Mkvcbuild.pm has a copy of this list, too).  logging.c is excluded
 # from OBJS_FRONTEND_SHLIB (shared library) as a matter of policy,
@@ -110,6 +113,9 @@ OBJS_FRONTEND = \
 OBJS_SHLIB = $(OBJS_FRONTEND_SHLIB:%.o=%_shlib.o)
 OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
 
+# Only use the simdjson version in the server
+OBJS_SRV += simdjson-glue_srv.o
+
 # where to find gen_keywordlist.pl and subsidiary files
 TOOLSDIR = $(top_srcdir)/src/tools
 GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
diff --git a/src/common/simdjson-glue.c b/src/common/simdjson-glue.c
new file mode 100644
index 00000000000..9c300bde022
--- /dev/null
+++ b/src/common/simdjson-glue.c
@@ -0,0 +1,90 @@
+#include <simdjson.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+// Heikki: This is copy-pasted from simdjson library
+// src/fallback/dom_parser_implementation.cpp
+//
+// Note: Apache licensed!
+
+// credit: based on code from Google Fuchsia (Apache Licensed)
+static bool fallback_validate_utf8(const char *buf, size_t len)
+{
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  uint64_t pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // check of the next 8 bytes are ascii.
+    uint64_t next_pos = pos + 16;
+    if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v1;
+      memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
+        pos = next_pos;
+        continue;
+      }
+    }
+    unsigned char byte = data[pos];
+    if (byte < 0b10000000) {
+      pos++;
+      continue;
+    } else if ((byte & 0b11100000) == 0b11000000) {
+      next_pos = pos + 2;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
+    } else if ((byte & 0b11110000) == 0b11100000) {
+      next_pos = pos + 3;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return false;
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+      next_pos = pos + 4;
+      if (next_pos > len) { return false; }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
+      // range check
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
+    } else {
+      // we may have a continuation
+      return false;
+    }
+    pos = next_pos;
+  }
+  return true;
+}
+
+
+extern "C" {
+
+bool
+simdjson_validate_utf8(const char *s, int len)
+{
+
+	// uncomment to test simdjson's fallback implementation
+	//return fallback_validate_utf8(s, len);
+
+	return simdjson::validate_utf8(s, len);
+
+}
+
+}
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 6e7d731e020..0c77f83ba01 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1758,7 +1758,7 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 }
 
 static int
-pg_utf8_verifystr(const unsigned char *s, int len)
+pg_utf8_verifystr_slow(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
 
@@ -1786,6 +1786,23 @@ pg_utf8_verifystr(const unsigned char *s, int len)
 	return s - start;
 }
 
+
+
+#ifndef FRONTEND
+extern bool simdjson_validate_utf8(const char *s, int len);
+#endif
+
+static int
+pg_utf8_verifystr(const unsigned char *s, int len)
+{
+#ifndef FRONTEND
+	if (true && simdjson_validate_utf8((const char *) s, len))
+		return len;
+	else
+#endif
+		return pg_utf8_verifystr_slow(s, len);
+}
+
 /*
  * Check for validity of a single UTF-8 encoded character
  *

Re: [POC] verifying UTF-8 using SIMD instructions

Reply via email to