Naively, the shift-based DFA requires 64-bit integers to encode the transitions, but I recently came across an idea from Dougall Johnson of using the Z3 SMT solver to pack the transitions into 32-bit integers [1]. That halves the size of the transition table for free. I adapted that effort to the existing conventions in v22 and arrived at the attached Python script. Running the script outputs the following:
$ python dfa-pack-pg.py offsets: [0, 11, 16, 1, 5, 6, 20, 25, 30] transitions: 00000000000000000000000000000000 0x0 00000000000000000101100000000000 0x5800 00000000000000001000000000000000 0x8000 00000000000000000000100000000000 0x800 00000000000000000010100000000000 0x2800 00000000000000000011000000000000 0x3000 00000000000000001010000000000000 0xa000 00000000000000001100100000000000 0xc800 00000000000000001111000000000000 0xf000 01000001000010110000000000100000 0x410b0020 00000011000010110000000000100000 0x30b0020 00000010000010110000010000100000 0x20b0420 I'll include something like the attached text file diff in the next patch. Some comments are now outdated, but this is good enough for demonstration. [1] https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f -- John Naylor EDB: http://www.enterprisedb.com
diff --git a/src/port/pg_utf8_fallback.c b/src/port/pg_utf8_fallback.c index 25e3031b48..0505feffc5 100644 --- a/src/port/pg_utf8_fallback.c +++ b/src/port/pg_utf8_fallback.c @@ -29,7 +29,7 @@ * With six bits per state, the mask is 63, whose importance is described * later. */ -#define DFA_BITS_PER_STATE 6 +#define DFA_BITS_PER_STATE 5 #define DFA_MASK ((1 << DFA_BITS_PER_STATE) - 1) @@ -82,19 +82,19 @@ */ /* Invalid state */ -#define ERR UINT64CONST(0) +#define ERR 0 /* Begin */ -#define BGN (UINT64CONST(1) * DFA_BITS_PER_STATE) +#define BGN 11 /* Continuation states */ -#define CS1 (UINT64CONST(2) * DFA_BITS_PER_STATE) /* expect 1 cont. byte */ -#define CS2 (UINT64CONST(3) * DFA_BITS_PER_STATE) /* expect 2 cont. bytes */ -#define CS3 (UINT64CONST(4) * DFA_BITS_PER_STATE) /* expect 3 cont. bytes */ +#define CS1 16 /* expect 1 cont. byte */ +#define CS2 1 /* expect 2 cont. bytes */ +#define CS3 5 /* expect 3 cont. bytes */ /* Partial 3-byte sequence states, expect 1 more cont. byte */ -#define P3A (UINT64CONST(5) * DFA_BITS_PER_STATE) /* leading byte was E0 */ -#define P3B (UINT64CONST(6) * DFA_BITS_PER_STATE) /* leading byte was ED */ +#define P3A 6 /* leading byte was E0 */ +#define P3B 20 /* leading byte was ED */ /* Partial 4-byte sequence states, expect 2 more cont. 
bytes */ -#define P4A (UINT64CONST(7) * DFA_BITS_PER_STATE) /* leading byte was F0 */ -#define P4B (UINT64CONST(8) * DFA_BITS_PER_STATE) /* leading byte was F4 */ +#define P4A 25 /* leading byte was F0 */ +#define P4B 30 /* leading byte was F4 */ /* Begin and End are the same state */ #define END BGN @@ -123,7 +123,7 @@ #define L4C (P4B << BGN) /* map an input byte to its byte category */ -const uint64 ByteCategory[256] = +const uint32 ByteCategory[256] = { /* ASCII */ @@ -181,7 +181,7 @@ const uint64 ByteCategory[256] = }; static inline void -utf8_advance(const unsigned char *s, uint64 *state, int len) +utf8_advance(const unsigned char *s, uint32 *state, int len) { /* Note: We deliberately don't check the state within the loop. */ while (len > 0) @@ -216,7 +216,7 @@ int pg_validate_utf8_fallback(const unsigned char *s, int len) { const int orig_len = len; - uint64 state = BGN; + uint32 state = BGN; while (len >= STRIDE_LENGTH) {
# pack UTF-8 shift-based DFA into 32-bit integers
#
# Finds a bit offset for each of the 9 DFA states and a packed 32-bit
# transition word for each character class such that
#     ((value >> current_state_offset) & 31) == next_state_offset
# which lets the UTF-8 validation transition table use uint32 entries
# instead of uint64.
#
# based on work by Dougall Johnson:
# https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f

from z3 import *

####### spec for 32-bit shift-based DFA

num_states = 9

# unique state->state transitions for all character classes
transitions = [
    #ER BGN CS1 2  3 P3A  B P4A  B
    (0, 0, 0, 0, 0, 0, 0, 0, 0),  # ILL
    (0, 1, 0, 0, 0, 0, 0, 0, 0),  # ASC
    (0, 2, 0, 0, 0, 0, 0, 0, 0),  # L2A
    (0, 3, 0, 0, 0, 0, 0, 0, 0),  # L3B
    (0, 4, 0, 0, 0, 0, 0, 0, 0),  # L4B
    (0, 5, 0, 0, 0, 0, 0, 0, 0),  # L3A
    (0, 6, 0, 0, 0, 0, 0, 0, 0),  # L3C
    (0, 7, 0, 0, 0, 0, 0, 0, 0),  # L4A
    (0, 8, 0, 0, 0, 0, 0, 0, 0),  # L4C
    (0, 0, 1, 2, 3, 0, 2, 0, 3),  # CR1
    (0, 0, 1, 2, 3, 0, 2, 3, 0),  # CR2
    (0, 0, 1, 2, 3, 2, 0, 3, 0)   # CR3
]

s = Solver()

# one bit offset per DFA state, one packed transition word per class
offsets = [BitVec('o%d' % i, 32) for i in range(num_states)]
values = [BitVec('v%d' % i, 32) for i in range(len(transitions))]

# offsets must fit in a 32-bit word and be pairwise distinct
for i in range(len(offsets)):
    s.add(offsets[i] < 32)
    for j in range(i + 1, len(offsets)):
        s.add(offsets[i] != offsets[j])

# core requirement: shifting a class's transition word right by the
# current state's offset and masking to 5 bits must yield the offset
# of the next state for that (class, state) pair
for vn, targets in enumerate(transitions):
    for off, target in enumerate(targets):
        s.add(((values[vn] >> offsets[off]) & 31) == offsets[target])

####### not strictly necessary, but keep things consistent

# set error state to zero
s.add(offsets[0] == 0)

# avoid sign extension
for v in values[1:]:
    s.add(v > 0)

# force transitions to match expressions based on the states
# (i.e. keep "dark" bits out)
s.add(values[0] == 0)
for vlead in values[1:9]:
    s.add(vlead & (31 << offsets[1]) == vlead)
s.add(values[9] == ((offsets[1] << offsets[2]) | (offsets[2] << offsets[3]) |
                    (offsets[3] << offsets[4]) | (offsets[2] << offsets[6]) |
                    (offsets[3] << offsets[8])))
s.add(values[10] == ((offsets[1] << offsets[2]) | (offsets[2] << offsets[3]) |
                     (offsets[3] << offsets[4]) | (offsets[2] << offsets[6]) |
                     (offsets[3] << offsets[7])))
s.add(values[11] == ((offsets[1] << offsets[2]) | (offsets[2] << offsets[3]) |
                     (offsets[3] << offsets[4]) | (offsets[2] << offsets[5]) |
                     (offsets[3] << offsets[7])))

# use increasing order where possible to make it look nicer
s.add(offsets[4] < offsets[5])
s.add(offsets[5] < offsets[6])
s.add(offsets[6] < offsets[7])
s.add(offsets[7] < offsets[8])

####### run the solver

result = s.check()
if result == sat:
    m = s.model()          # fetch the model once instead of per element
    offsets = [m[o].as_long() for o in offsets]
    print('offsets:', offsets)
    values = [m[v].as_long() for v in values]
    print('transitions:')
    for v in values:
        print(format(v, '032b'), hex(v))
else:
    # previously the script produced no output at all when the
    # constraints could not be satisfied; report that explicitly
    print('no solution found:', result)