speed up text_position() for utf-8

John Naylor Mon, 13 Dec 2021 08:03:52 -0800

Hi,

Commit 9556aa01c69 (Use single-byte Boyer-Moore-Horspool search even
with multibyte encodings), was a speed improvement for the majority of
cases, but when the match was toward the end of the string, the
slowdown in text_position_get_match_pos() was noticeable. It was found
that there was a lot of overhead in pg_mblen(). [1]


The attached exploratory PoC improves this for utf-8. It applies on
top v25 of my utf-8 verification patch in [2], since one approach
relies on the DFA from it. The other three approaches are:
- a version of pg_utf_mblen() that uses a lookup table [3]
- an inlined copy of pg_utf_mblen()
- an ascii fast path with a fallback to the inlined copy of pg_utf_mblen()

The test is attached and the test function is part of the patch. It's
based on the test used in the commit above. The test searches for a
string that's at the end of a ~1 million byte string. This is on gcc
11 with 2-3 runs to ensure repeatability, but I didn't bother with
statistics because the differences are pretty big:

                  patch                  | no match | ascii | mulitbyte
-----------------------------------------+----------+-------+-----------
 PG11                                    |     1120 |  1100 |       900
 master                                  |      381 |  2350 |      1900
 DFA                                     |      386 |  1640 |      1640
 branchless utf mblen                    |      387 |  4100 |      2600
 inline pg_utf_mblen()                   |      380 |  1080 |       920
 inline pg_utf_mblen() + ascii fast path |      382 |   470 |       918

Neither of the branchless approaches worked well. The DFA can't work
as well here as in verification because it must do additional work.
Inlining pg_utf_mblen() restores worst-case performance to PG11
levels. The ascii fast path is a nice improvement on top of that. A
similar approach could work for pg_mbstrlen() as well, but I haven't
looked into that yet. There are other callers of pg_mblen(), but I
haven't looked into whether they are performance-sensitive. A more
general application would be preferable to a targeted one.

[1] 
https://www.postgresql.org/message-id/b65df3d8-1f59-3bd7-ebbe-68b81d5a76a4%40iki.fi
[2] 
https://www.postgresql.org/message-id/CAFBsxsHG%3Dg6W8Mie%2B_NO8dV6O0pO2stxrnS%3Dme5ZmGqk--fd5g%40mail.gmail.com
[3] https://github.com/skeeto/branchless-utf8/blob/master/utf8.h

--
John Naylor
EDB: http://www.enterprisedb.com

 src/backend/utils/adt/varlena.c | 112 ++++++++++++++++++++++++++++++++++++++++
 src/common/wchar.c              |  90 ++------------------------------
 src/include/mb/pg_wchar.h       |  53 -------------------
 src/test/regress/regress.c      |  18 +++++++
 4 files changed, 133 insertions(+), 140 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index bd3091bbfb..cc93818007 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -26,6 +26,7 @@
 #include "common/unicode_norm.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
+#include "mb/pg_utf8.h"
 #include "miscadmin.h"
 #include "nodes/execnodes.h"
 #include "parser/scansup.h"
@@ -1468,6 +1469,116 @@ text_position_get_match_pos(TextPositionState *state)
 {
 	if (!state->is_multibyte)
 		return state->last_match - state->str1 + 1;
+	else if (GetDatabaseEncoding() == PG_UTF8)
+	{
+#if 0	/* dfa */
+
+		int			utf8_state_count[UTF8_LAST_STATE + 1] = {0};
+		uint32		dfa_state = BGN;
+
+		/*
+		 * See pg_utf8.h for an explanation of the state machine. Unlike in
+		 * pg_wchar.c, we must calculate the exact state so we can use it as
+		 * an array index.
+		 */
+		while (state->refpoint < state->last_match)
+		{
+			const unsigned char *s = (const unsigned char *) state->refpoint++;
+
+			dfa_state = (Utf8Transition[*s] >> dfa_state) & 31;
+			utf8_state_count[dfa_state]++;
+		}
+		Assert(state->refpoint == state->last_match);
+		state->refpos += utf8_state_count[END];
+		return state->refpos + 1;
+
+#endif
+#if 0	/* like pg_utf_mblen(), but different algorithm: */
+
+		static const char lengths[] = {
+			1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+			1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 1
+		};
+
+		while (state->refpoint < state->last_match)
+		{
+			const unsigned char *s = (const unsigned char *) state->refpoint;
+
+			state->refpoint += lengths[s[0] >> 3];
+			state->refpos++;
+		}
+
+		Assert(state->refpoint == state->last_match);
+		return state->refpos + 1;
+#endif
+#if 0	/* inline pg_utf_mblen()*/
+
+		int len = 0;
+
+		while (state->refpoint < state->last_match)
+		{
+			const unsigned char *s = (const unsigned char *) state->refpoint;
+
+			if ((*s & 0x80) == 0)
+				len = 1;
+			else if ((*s & 0xe0) == 0xc0)
+				len = 2;
+			else if ((*s & 0xf0) == 0xe0)
+				len = 3;
+			else if ((*s & 0xf8) == 0xf0)
+				len = 4;
+			else
+				len = 1;
+
+			state->refpoint += len;
+			state->refpos++;
+		}
+
+		Assert(state->refpoint == state->last_match);
+		return state->refpos + 1;
+#endif
+#if 1	/* try ASCII fast path, then fall back to inlined pg_utf_mblen() */
+
+		int len = 0;
+
+#define STRIDE_LENGTH 16
+
+		/* fast path for ascii text. */
+		while (state->refpoint + STRIDE_LENGTH <= state->last_match)
+		{
+			if (is_valid_ascii((const unsigned char *) state->refpoint, STRIDE_LENGTH))
+			{
+				state->refpoint += STRIDE_LENGTH;
+				state->refpos += STRIDE_LENGTH;
+			}
+			else
+				break;
+		}
+
+		while (state->refpoint < state->last_match)
+		{
+			const unsigned char *s = (const unsigned char *) state->refpoint;
+
+			if ((*s & 0x80) == 0)
+				len = 1;
+			else if ((*s & 0xe0) == 0xc0)
+				len = 2;
+			else if ((*s & 0xf0) == 0xe0)
+				len = 3;
+			else if ((*s & 0xf8) == 0xf0)
+				len = 4;
+			else
+				len = 1;
+
+			state->refpoint += len;
+			state->refpos++;
+		}
+
+		Assert(state->refpoint == state->last_match);
+		return state->refpos + 1;
+#endif
+	}
+
 	else
 	{
 		/* Convert the byte position to char position. */
@@ -1481,6 +1592,7 @@ text_position_get_match_pos(TextPositionState *state)
 	}
 }
 
+
 /*
  * Reset search state to the initial state installed by text_position_setup.
  *
diff --git a/src/common/wchar.c b/src/common/wchar.c
index be931c5e92..2c51c3b6f9 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -12,6 +12,7 @@
  */
 #include "c.h"
 
+#include "mb/pg_utf8.h"
 #include "mb/pg_wchar.h"
 
 
@@ -1750,94 +1751,9 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 	return l;
 }
 
-/*
- * The fast path of the UTF-8 verifier uses a deterministic finite automaton
- * (DFA) for multibyte characters. In a traditional table-driven DFA, the
- * input byte and current state are used to compute an index into an array of
- * state transitions. Since the address of the next transition is dependent
- * on this computation, there is latency in executing the load instruction,
- * and the CPU is not kept busy.
- *
- * Instead, we use a "shift-based" DFA as described by Per Vognsen:
- *
- * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
- *
- * In a shift-based DFA, the input byte is an index into array of integers
- * whose bit pattern encodes the state transitions. To compute the next
- * state, we simply right-shift the integer by the current state and apply a
- * mask. In this scheme, the address of the transition only depends on the
- * input byte, so there is better pipelining.
- *
- * The naming convention for states and transitions was adopted from a UTF-8
- * to UTF-16/32 transcoder, whose table is reproduced below:
- *
- * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
- *
- * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
- * ==========================================================================
- * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
- * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
- *                                                                  |
- * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
- * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
- * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
- *                                                                  |
- * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
- * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
- *                                                                  |
- * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
- * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
- *
- * In the most straightforward implementation, a shift-based DFA for UTF-8
- * requires 64-bit integers to encode the transitions, but with an SMT solver
- * it's possible to find state numbers such that the transitions fit within
- * 32-bit integers, as Dougall Johnson demonstrated:
- *
- * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
- *
- * This packed representation is the reason for the seemingly odd choice of
- * state values below.
- */
 
-/* Error */
-#define	ERR  0
-/* Begin */
-#define	BGN 11
-/* Continuation states, expect 1/2/3 continuation bytes */
-#define	CS1 16
-#define	CS2  1
-#define	CS3  5
-/* Leading byte was E0/ED, expect 1 more continuation byte */
-#define	P3A  6
-#define	P3B 20
-/* Leading byte was F0/F4, expect 2 more continuation bytes */
-#define	P4A 25
-#define	P4B 30
-/* Begin and End are the same state */
-#define	END BGN
-
-/* the encoded state transitions for the lookup table */
-
-/* ASCII */
-#define ASC (END << BGN)
-/* 2-byte lead */
-#define L2A (CS1 << BGN)
-/* 3-byte lead */
-#define L3A (P3A << BGN)
-#define L3B (CS2 << BGN)
-#define L3C (P3B << BGN)
-/* 4-byte lead */
-#define L4A (P4A << BGN)
-#define L4B (CS3 << BGN)
-#define L4C (P4B << BGN)
-/* continuation byte */
-#define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
-#define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
-#define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
-/* invalid byte */
-#define ILL ERR
-
-static const uint32 Utf8Transition[256] =
+/* transition table for the UTF-8 DFA -- see pg_utf8.h for details */
+const uint32 Utf8Transition[256] =
 {
 	/* ASCII */
 
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 6bd996b3d0..d93ccac263 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -699,57 +699,4 @@ extern int	mic2latin_with_table(const unsigned char *mic, unsigned char *p,
 extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
 #endif
 
-
-/*
- * Verify a chunk of bytes for valid ASCII.
- *
- * Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
- */
-static inline bool
-is_valid_ascii(const unsigned char *s, int len)
-{
-	uint64		chunk,
-				highbit_cum = UINT64CONST(0),
-				zero_cum = UINT64CONST(0x8080808080808080);
-
-	Assert(len % sizeof(chunk) == 0);
-
-	while (len > 0)
-	{
-		memcpy(&chunk, s, sizeof(chunk));
-
-		/*
-		 * Capture any zero bytes in this chunk.
-		 *
-		 * First, add 0x7f to each byte. This sets the high bit in each byte,
-		 * unless it was a zero. If any resulting high bits are zero, the
-		 * corresponding high bits in the zero accumulator will be cleared.
-		 *
-		 * If none of the bytes in the chunk had the high bit set, the max
-		 * value each byte can have after the addition is 0x7f + 0x7f = 0xfe,
-		 * and we don't need to worry about carrying over to the next byte. If
-		 * any input bytes did have the high bit set, it doesn't matter
-		 * because we check for those separately.
-		 */
-		zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
-
-		/* Capture any set bits in this chunk. */
-		highbit_cum |= chunk;
-
-		s += sizeof(chunk);
-		len -= sizeof(chunk);
-	}
-
-	/* Check if any high bits in the high bit accumulator got set. */
-	if (highbit_cum & UINT64CONST(0x8080808080808080))
-		return false;
-
-	/* Check if any high bits in the zero accumulator got cleared. */
-	if (zero_cum != UINT64CONST(0x8080808080808080))
-		return false;
-
-	return true;
-}
-
 #endif							/* PG_WCHAR_H */
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 351d79e1f0..7f23cffa21 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -1206,3 +1206,21 @@ binary_coercible(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(IsBinaryCoercible(srctype, targettype));
 }
+
+
+PG_FUNCTION_INFO_V1(benchmark_textpos);
+Datum
+benchmark_textpos(PG_FUNCTION_ARGS)
+{
+	text	   *t1 = PG_GETARG_TEXT_PP(0);
+	text	   *t2 = PG_GETARG_TEXT_PP(1);
+	int			loops = PG_GETARG_INT32(2);
+	int			i;
+	Datum		result = Int32GetDatum(0);
+
+	for (i = 0; i < loops; i++)
+	{
+		result = DirectFunctionCall2Coll(textpos, PG_GET_COLLATION(), PointerGetDatum(t1), PointerGetDatum(t2));
+	}
+	return result;
+}

textpos-benchmark-utf8.sql
Description: application/sql

speed up text_position() for utf-8

Reply via email to