On Tue, Dec 26, 2023 at 8:49 AM Andrew Dunstan <and...@dunslane.net> wrote:
> Quite a long time ago Robert asked me about the possibility of an
> incremental JSON parser. I wrote one, and I've tweaked it a bit, but the
> performance is significantly worse than that of the current Recursive
> Descent parser.

The prediction stack is neat. It seems like the main loop is hit so
many thousands of times that micro-optimization would be necessary...
I attached a sample diff to get rid of the strlen calls during
push_prediction(), which speeds things up a bit (8-15%, depending on
optimization level) on my machines.
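
For anyone skimming the attached diff: the gist is to store each
production's length next to the production itself, computed at compile
time, so push_prediction() can memcpy a known number of bytes instead of
calling strlen() inside the hot loop. A minimal standalone sketch of that
pattern (not the actual jsonapi.c code; the production below is made up):

    #include <stdio.h>
    #include <string.h>

    struct td_entry
    {
        size_t      len;
        const char *prod;
    };

    /* sizeof of a char array includes the terminating NUL, hence the -1 */
    #define TD_ENTRY(PROD) { sizeof(PROD) - 1, (PROD) }

    static char SOME_PROD[] = "abc";    /* stand-in for a real production */

    int
    main(void)
    {
        struct td_entry e = TD_ENTRY(SOME_PROD);
        char        stack[16];
        size_t      top = 0;

        /* push: the length is already known, so no strlen() is needed */
        memcpy(stack + top, e.prod, e.len);
        top += e.len;

        printf("pushed %zu bytes\n", top);
        return 0;
    }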

Maybe it's possible to condense some of those productions down, and
reduce the loop count? E.g. does every "scalar" production need to go
three times through the loop/stack, or can the scalar semantic action
just peek at the next token prediction and do all the callback work at
once?
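
To make that concrete, here's a toy sketch of the kind of shortcut I mean
(the symbols are made up, not the real jsonapi.c productions): when the
popped symbol is the scalar marker, peek at the predicted terminal right
behind it and do the callback work in one trip through the loop instead of
three:

    #include <stdio.h>

    enum symbol
    {
        SYM_SCALAR_INIT,
        SYM_TOKEN,
        SYM_SCALAR_CALL
    };

    int
    main(void)
    {
        /* reversed RHS of a hypothetical scalar production */
        enum symbol stack[] = {SYM_SCALAR_CALL, SYM_TOKEN, SYM_SCALAR_INIT};
        int         top = 3;
        int         trips = 0;

        while (top > 0)
        {
            enum symbol s = stack[--top];

            trips++;
            if (s == SYM_SCALAR_INIT && top >= 2 && stack[top - 1] == SYM_TOKEN)
            {
                /* consume the predicted terminal and fire the callback now */
                top -= 2;       /* pops SYM_TOKEN and SYM_SCALAR_CALL */
                printf("scalar callback\n");
            }
            /* (otherwise each symbol would get its own trip through the loop) */
        }

        printf("loop trips: %d\n", trips);
        return 0;
    }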

> +               case JSON_SEM_SCALAR_CALL:
> +                   {
> +                       json_scalar_action sfunc = sem->scalar;
> +
> +                       if (sfunc != NULL)
> +                           (*sfunc) (sem->semstate, scalar_val, scalar_tok);
> +                   }

Is it safe to store state (scalar_val/scalar_tok) on the stack, or
does it disappear if the parser hits an incomplete token?

> One possible use would be in parsing large manifest files for
> incremental backup.

I'm keeping an eye on this thread for OAuth, since the clients have to
parse JSON as well. Those responses tend to be smaller, though, so
you'd have to really be hurting for resources to need this.

--Jacob
commit 79d0dc78b9f3b0bbc078876417b8f46970308e6e
Author: Jacob Champion <champio...@gmail.com>
Date:   Thu Jan 4 11:46:06 2024 -0800

    WIP: try to speed up prediction

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index d875d57df7..9149d45a4b 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -165,39 +165,47 @@ static char JSON_PROD_MORE_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_NT_KEYPAI
  * Any combination not specified here represents an error.
  */
 
-static char *td_parser_table[JSON_NUM_NONTERMINALS][JSON_NUM_TERMINALS] =
+struct td_entry
+{
+       size_t          len;
+       char       *prod;
+};
+
+#define TD_ENTRY(PROD) { sizeof(PROD) - 1, (PROD) }
+
+static struct td_entry td_parser_table[JSON_NUM_NONTERMINALS][JSON_NUM_TERMINALS] =
 {
        /* JSON */
-       [OFS(JSON_NT_JSON)][JSON_TOKEN_STRING] = JSON_PROD_SCALAR_STRING,
-       [OFS(JSON_NT_JSON)][JSON_TOKEN_NUMBER] = JSON_PROD_SCALAR_NUMBER,
-       [OFS(JSON_NT_JSON)][JSON_TOKEN_TRUE] = JSON_PROD_SCALAR_TRUE,
-       [OFS(JSON_NT_JSON)][JSON_TOKEN_FALSE] = JSON_PROD_SCALAR_FALSE,
-       [OFS(JSON_NT_JSON)][JSON_TOKEN_NULL] = JSON_PROD_SCALAR_NULL,
-       [OFS(JSON_NT_JSON)][JSON_TOKEN_ARRAY_START] = JSON_PROD_ARRAY,
-       [OFS(JSON_NT_JSON)][JSON_TOKEN_OBJECT_START] = JSON_PROD_OBJECT,
+       [OFS(JSON_NT_JSON)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_SCALAR_STRING),
+       [OFS(JSON_NT_JSON)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_SCALAR_NUMBER),
+       [OFS(JSON_NT_JSON)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_SCALAR_TRUE),
+       [OFS(JSON_NT_JSON)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_SCALAR_FALSE),
+       [OFS(JSON_NT_JSON)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_SCALAR_NULL),
+       [OFS(JSON_NT_JSON)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY),
+       [OFS(JSON_NT_JSON)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_OBJECT),
        /* ARRAY_ELEMENTS */
-       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_START] = JSON_PROD_ARRAY_ELEMENTS,
-       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_OBJECT_START] = JSON_PROD_ARRAY_ELEMENTS,
-       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_STRING] = JSON_PROD_ARRAY_ELEMENTS,
-       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NUMBER] = JSON_PROD_ARRAY_ELEMENTS,
-       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_TRUE] = JSON_PROD_ARRAY_ELEMENTS,
-       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_FALSE] = JSON_PROD_ARRAY_ELEMENTS,
-       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NULL] = JSON_PROD_ARRAY_ELEMENTS,
-       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = JSON_PROD_EPSILON,
+       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+       [OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
        /* MORE_ARRAY_ELEMENTS */
-       [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_COMMA] = JSON_PROD_MORE_ARRAY_ELEMENTS,
-       [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = JSON_PROD_EPSILON,
+       [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_ARRAY_ELEMENTS),
+       [OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
 
        /*
         * KEYPAIR - could expand but it's clearer not to.
         */
-       [OFS(JSON_NT_KEYPAIR)][JSON_TOKEN_STRING] = JSON_PROD_KEYPAIR,
+       [OFS(JSON_NT_KEYPAIR)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_KEYPAIR),
        /* KEY_PAIRS */
-       [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_STRING] = JSON_PROD_KEY_PAIRS,
-       [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = JSON_PROD_EPSILON,
+       [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_KEY_PAIRS),
+       [OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
        /* MORE_KEY_PAIRS */
-       [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_COMMA] = JSON_PROD_MORE_KEY_PAIRS,
-       [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = JSON_PROD_EPSILON,
+       [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_KEY_PAIRS),
+       [OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
 };
 
 /* the GOAL production. Not stored in the table, but will be the initial contents of the prediction stack */
@@ -419,11 +427,10 @@ dec_lex_level(JsonLexContext *lex)
 }
 
 static inline void
-push_prediction(JsonParserStack *pstack, char *prod)
+push_prediction(JsonParserStack *pstack, struct td_entry entry)
 {
-       int len = strlen(prod);
-       memcpy(pstack->prediction + pstack->pred_index, prod, len);
-       pstack->pred_index += len;
+       memcpy(pstack->prediction + pstack->pred_index, entry.prod, entry.len);
+       pstack->pred_index += entry.len;
 }
 
 static inline char
@@ -651,12 +658,15 @@ pg_parse_json_incremental(JsonLexContext *lex,
        /* use prediction stack for incremental parsing */
 
        if (!have_prediction(pstack))
-               push_prediction(pstack, JSON_PROD_GOAL);
+       {
+               struct td_entry goal = TD_ENTRY(JSON_PROD_GOAL);
+               push_prediction(pstack, goal);
+       }
 
        while (have_prediction(pstack))
        {
                char            top = pop_prediction(pstack);
-               char       *prod;
+               struct td_entry entry;
 
                /*
                 * these first two branches are the guts of the Table Driven
@@ -676,14 +686,14 @@ pg_parse_json_incremental(JsonLexContext *lex,
                                tok = lex_peek(lex);
                        }
                }
-               else if (IS_NT(top) && (prod = td_parser_table[OFS(top)][tok]) != NULL)
+               else if (IS_NT(top) && (entry = td_parser_table[OFS(top)][tok]).prod != NULL)
                {
                        /*
                         * the token is in the director set for a production of the
                         * non-terminal at the top of the stack, so push the reversed
                         * RHS of the production onto the stack.
                         */
-                       push_prediction(pstack, prod);
+                       push_prediction(pstack, entry);
                }
                else if (IS_SEM(top))
                {
