Hi, Commit 9556aa01c69 (Use single-byte Boyer-Moore-Horspool search even with multibyte encodings), was a speed improvement for the majority of cases, but when the match was toward the end of the string, the slowdown in text_position_get_match_pos() was noticeable. It was found that there was a lot of overhead in pg_mblen(). [1]
The attached exploratory PoC improves this for utf-8. It applies on top v25 of my utf-8 verification patch in [2], since one approach relies on the DFA from it. The other three approaches are: - a version of pg_utf_mblen() that uses a lookup table [3] - an inlined copy of pg_utf_mblen() - an ascii fast path with a fallback to the inlined copy of pg_utf_mblen() The test is attached and the test function is part of the patch. It's based on the test used in the commit above. The test searches for a string that's at the end of a ~1 million byte string. This is on gcc 11 with 2-3 runs to ensure repeatability, but I didn't bother with statistics because the differences are pretty big: patch | no match | ascii | mulitbyte -----------------------------------------+----------+-------+----------- PG11 | 1120 | 1100 | 900 master | 381 | 2350 | 1900 DFA | 386 | 1640 | 1640 branchless utf mblen | 387 | 4100 | 2600 inline pg_utf_mblen() | 380 | 1080 | 920 inline pg_utf_mblen() + ascii fast path | 382 | 470 | 918 Neither of the branchless approaches worked well. The DFA can't work as well here as in verification because it must do additional work. Inlining pg_utf_mblen() restores worst-case performance to PG11 levels. The ascii fast path is a nice improvement on top of that. A similar approach could work for pg_mbstrlen() as well, but I haven't looked into that yet. There are other callers of pg_mblen(), but I haven't looked into whether they are performance-sensitive. A more general application would be preferable to a targeted one. [1] https://www.postgresql.org/message-id/b65df3d8-1f59-3bd7-ebbe-68b81d5a76a4%40iki.fi [2] https://www.postgresql.org/message-id/CAFBsxsHG%3Dg6W8Mie%2B_NO8dV6O0pO2stxrnS%3Dme5ZmGqk--fd5g%40mail.gmail.com [3] https://github.com/skeeto/branchless-utf8/blob/master/utf8.h -- John Naylor EDB: http://www.enterprisedb.com
src/backend/utils/adt/varlena.c | 112 ++++++++++++++++++++++++++++++++++++++++ src/common/wchar.c | 90 ++------------------------------ src/include/mb/pg_wchar.h | 53 ------------------- src/test/regress/regress.c | 18 +++++++ 4 files changed, 133 insertions(+), 140 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index bd3091bbfb..cc93818007 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -26,6 +26,7 @@ #include "common/unicode_norm.h" #include "lib/hyperloglog.h" #include "libpq/pqformat.h" +#include "mb/pg_utf8.h" #include "miscadmin.h" #include "nodes/execnodes.h" #include "parser/scansup.h" @@ -1468,6 +1469,116 @@ text_position_get_match_pos(TextPositionState *state) { if (!state->is_multibyte) return state->last_match - state->str1 + 1; + else if (GetDatabaseEncoding() == PG_UTF8) + { +#if 0 /* dfa */ + + int utf8_state_count[UTF8_LAST_STATE + 1] = {0}; + uint32 dfa_state = BGN; + + /* + * See pg_utf8.h for an explanation of the state machine. Unlike in + * pg_wchar.c, we must calculate the exact state so we can use it as + * an array index. + */ + while (state->refpoint < state->last_match) + { + const unsigned char *s = (const unsigned char *) state->refpoint++; + + dfa_state = (Utf8Transition[*s] >> dfa_state) & 31; + utf8_state_count[dfa_state]++; + } + Assert(state->refpoint == state->last_match); + state->refpos += utf8_state_count[END]; + return state->refpos + 1; + +#endif +#if 0 /* like pg_utf_mblen(), but different algorithm: */ + + static const char lengths[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 1 + }; + + while (state->refpoint < state->last_match) + { + const unsigned char *s = (const unsigned char *) state->refpoint; + + state->refpoint += lengths[s[0] >> 3]; + state->refpos++; + } + + Assert(state->refpoint == state->last_match); + return state->refpos + 1; +#endif +#if 0 /* inline pg_utf_mblen()*/ + + int len = 0; + + while (state->refpoint < state->last_match) + { + const unsigned char *s = (const unsigned char *) state->refpoint; + + if ((*s & 0x80) == 0) + len = 1; + else if ((*s & 0xe0) == 0xc0) + len = 2; + else if ((*s & 0xf0) == 0xe0) + len = 3; + else if ((*s & 0xf8) == 0xf0) + len = 4; + else + len = 1; + + state->refpoint += len; + state->refpos++; + } + + Assert(state->refpoint == state->last_match); + return state->refpos + 1; +#endif +#if 1 /* try ASCII fast path, then fall back to inlined pg_utf_mblen() */ + + int len = 0; + +#define STRIDE_LENGTH 16 + + /* fast path for ascii text. */ + while (state->refpoint + STRIDE_LENGTH <= state->last_match) + { + if (is_valid_ascii((const unsigned char *) state->refpoint, STRIDE_LENGTH)) + { + state->refpoint += STRIDE_LENGTH; + state->refpos += STRIDE_LENGTH; + } + else + break; + } + + while (state->refpoint < state->last_match) + { + const unsigned char *s = (const unsigned char *) state->refpoint; + + if ((*s & 0x80) == 0) + len = 1; + else if ((*s & 0xe0) == 0xc0) + len = 2; + else if ((*s & 0xf0) == 0xe0) + len = 3; + else if ((*s & 0xf8) == 0xf0) + len = 4; + else + len = 1; + + state->refpoint += len; + state->refpos++; + } + + Assert(state->refpoint == state->last_match); + return state->refpos + 1; +#endif + } + else { /* Convert the byte position to char position. */ @@ -1481,6 +1592,7 @@ text_position_get_match_pos(TextPositionState *state) } } + /* * Reset search state to the initial state installed by text_position_setup. * diff --git a/src/common/wchar.c b/src/common/wchar.c index be931c5e92..2c51c3b6f9 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -12,6 +12,7 @@ */ #include "c.h" +#include "mb/pg_utf8.h" #include "mb/pg_wchar.h" @@ -1750,94 +1751,9 @@ pg_utf8_verifychar(const unsigned char *s, int len) return l; } -/* - * The fast path of the UTF-8 verifier uses a deterministic finite automaton - * (DFA) for multibyte characters. In a traditional table-driven DFA, the - * input byte and current state are used to compute an index into an array of - * state transitions. Since the address of the next transition is dependent - * on this computation, there is latency in executing the load instruction, - * and the CPU is not kept busy. - * - * Instead, we use a "shift-based" DFA as described by Per Vognsen: - * - * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725 - * - * In a shift-based DFA, the input byte is an index into array of integers - * whose bit pattern encodes the state transitions. To compute the next - * state, we simply right-shift the integer by the current state and apply a - * mask. In this scheme, the address of the transition only depends on the - * input byte, so there is better pipelining. - * - * The naming convention for states and transitions was adopted from a UTF-8 - * to UTF-16/32 transcoder, whose table is reproduced below: - * - * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp - * - * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE - * ========================================================================== - * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END - * err, err, err, err, err, err, err, err, err, err, err, err, | ERR - * | - * err, err, END, END, END, err, err, err, err, err, err, err, | CS1 - * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2 - * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3 - * | - * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A - * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B - * | - * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A - * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B - * - * In the most straightforward implementation, a shift-based DFA for UTF-8 - * requires 64-bit integers to encode the transitions, but with an SMT solver - * it's possible to find state numbers such that the transitions fit within - * 32-bit integers, as Dougall Johnson demonstrated: - * - * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f - * - * This packed representation is the reason for the seemingly odd choice of - * state values below. - */ -/* Error */ -#define ERR 0 -/* Begin */ -#define BGN 11 -/* Continuation states, expect 1/2/3 continuation bytes */ -#define CS1 16 -#define CS2 1 -#define CS3 5 -/* Leading byte was E0/ED, expect 1 more continuation byte */ -#define P3A 6 -#define P3B 20 -/* Leading byte was F0/F4, expect 2 more continuation bytes */ -#define P4A 25 -#define P4B 30 -/* Begin and End are the same state */ -#define END BGN - -/* the encoded state transitions for the lookup table */ - -/* ASCII */ -#define ASC (END << BGN) -/* 2-byte lead */ -#define L2A (CS1 << BGN) -/* 3-byte lead */ -#define L3A (P3A << BGN) -#define L3B (CS2 << BGN) -#define L3C (P3B << BGN) -/* 4-byte lead */ -#define L4A (P4A << BGN) -#define L4B (CS3 << BGN) -#define L4C (P4B << BGN) -/* continuation byte */ -#define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B) -#define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A) -#define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A) -/* invalid byte */ -#define ILL ERR - -static const uint32 Utf8Transition[256] = +/* transition table for the UTF-8 DFA -- see pg_utf8.h for details */ +const uint32 Utf8Transition[256] = { /* ASCII */ diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 6bd996b3d0..d93ccac263 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -699,57 +699,4 @@ extern int mic2latin_with_table(const unsigned char *mic, unsigned char *p, extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len); #endif - -/* - * Verify a chunk of bytes for valid ASCII. - * - * Returns false if the input contains any zero bytes or bytes with the - * high-bit set. Input len must be a multiple of 8. - */ -static inline bool -is_valid_ascii(const unsigned char *s, int len) -{ - uint64 chunk, - highbit_cum = UINT64CONST(0), - zero_cum = UINT64CONST(0x8080808080808080); - - Assert(len % sizeof(chunk) == 0); - - while (len > 0) - { - memcpy(&chunk, s, sizeof(chunk)); - - /* - * Capture any zero bytes in this chunk. - * - * First, add 0x7f to each byte. This sets the high bit in each byte, - * unless it was a zero. If any resulting high bits are zero, the - * corresponding high bits in the zero accumulator will be cleared. - * - * If none of the bytes in the chunk had the high bit set, the max - * value each byte can have after the addition is 0x7f + 0x7f = 0xfe, - * and we don't need to worry about carrying over to the next byte. If - * any input bytes did have the high bit set, it doesn't matter - * because we check for those separately. - */ - zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f)); - - /* Capture any set bits in this chunk. */ - highbit_cum |= chunk; - - s += sizeof(chunk); - len -= sizeof(chunk); - } - - /* Check if any high bits in the high bit accumulator got set. */ - if (highbit_cum & UINT64CONST(0x8080808080808080)) - return false; - - /* Check if any high bits in the zero accumulator got cleared. */ - if (zero_cum != UINT64CONST(0x8080808080808080)) - return false; - - return true; -} - #endif /* PG_WCHAR_H */ diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 351d79e1f0..7f23cffa21 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -1206,3 +1206,21 @@ binary_coercible(PG_FUNCTION_ARGS) PG_RETURN_BOOL(IsBinaryCoercible(srctype, targettype)); } + + +PG_FUNCTION_INFO_V1(benchmark_textpos); +Datum +benchmark_textpos(PG_FUNCTION_ARGS) +{ + text *t1 = PG_GETARG_TEXT_PP(0); + text *t2 = PG_GETARG_TEXT_PP(1); + int loops = PG_GETARG_INT32(2); + int i; + Datum result = Int32GetDatum(0); + + for (i = 0; i < loops; i++) + { + result = DirectFunctionCall2Coll(textpos, PG_GET_COLLATION(), PointerGetDatum(t1), PointerGetDatum(t2)); + } + return result; +}
textpos-benchmark-utf8.sql
Description: application/sql