On Tue, Dec 26, 2023 at 8:49 AM Andrew Dunstan <and...@dunslane.net> wrote:
> Quite a long time ago Robert asked me about the possibility of an
> incremental JSON parser. I wrote one, and I've tweaked it a bit, but the
> performance is significantly worse than that of the current Recursive
> Descent parser.
The prediction stack is neat. It seems like the main loop is hit so many
thousands of times that micro-optimization would be necessary... I attached
a sample diff to get rid of the strlen calls during push_prediction(), which
speeds things up a bit (8-15%, depending on optimization level) on my
machines.

Maybe it's possible to condense some of those productions down, and reduce
the loop count? E.g. does every "scalar" production need to go three times
through the loop/stack, or can the scalar semantic action just peek at the
next token prediction and do all the callback work at once?

> +			case JSON_SEM_SCALAR_CALL:
> +				{
> +					json_scalar_action sfunc = sem->scalar;
> +
> +					if (sfunc != NULL)
> +						(*sfunc) (sem->semstate, scalar_val, scalar_tok);
> +				}

Is it safe to store state (scalar_val/scalar_tok) on the stack, or does it
disappear if the parser hits an incomplete token?

> One possible use would be in parsing large manifest files for
> incremental backup.

I'm keeping an eye on this thread for OAuth, since the clients have to parse
JSON as well. Those responses tend to be smaller, though, so you'd have to
really be hurting for resources to need this.

--Jacob
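
To make the scalar-condensation idea above a bit more concrete, here is the
rough shape I was imagining. This is only a hand-waved sketch, not anything
in the attached diff: is_scalar_production() and get_scalar_token_value()
are invented names.

    /*
     * Hypothetical: when the table lookup selects a scalar production,
     * do the init/token/call work here in one visit instead of pushing
     * three separate prediction entries through the main loop.
     */
    else if (IS_NT(top) && is_scalar_production(entry))    /* invented */
    {
        json_scalar_action sfunc = sem->scalar;
        JsonTokenType scalar_tok = tok;
        char       *scalar_val = get_scalar_token_value(lex);  /* invented */

        json_lex(lex);          /* consume the scalar token */
        if (sfunc != NULL)
            (*sfunc) (sem->semstate, scalar_val, scalar_tok);
        tok = lex_peek(lex);
    }

It glosses over the incomplete-token question, of course.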
commit 79d0dc78b9f3b0bbc078876417b8f46970308e6e
Author: Jacob Champion <champio...@gmail.com>
Date:   Thu Jan 4 11:46:06 2024 -0800

    WIP: try to speed up prediction

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index d875d57df7..9149d45a4b 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -165,39 +165,47 @@ static char JSON_PROD_MORE_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_NT_KEYPAI
  * Any combination not specified here represents an error.
  */
 
-static char *td_parser_table[JSON_NUM_NONTERMINALS][JSON_NUM_TERMINALS] =
+struct td_entry
+{
+	size_t		len;
+	char	   *prod;
+};
+
+#define TD_ENTRY(PROD) { sizeof(PROD) - 1, (PROD) }
+
+static struct td_entry td_parser_table[JSON_NUM_NONTERMINALS][JSON_NUM_TERMINALS] =
 {
 	/* JSON */
-	[OFS(JSON_NT_JSON)][JSON_TOKEN_STRING] = JSON_PROD_SCALAR_STRING,
-	[OFS(JSON_NT_JSON)][JSON_TOKEN_NUMBER] = JSON_PROD_SCALAR_NUMBER,
-	[OFS(JSON_NT_JSON)][JSON_TOKEN_TRUE] = JSON_PROD_SCALAR_TRUE,
-	[OFS(JSON_NT_JSON)][JSON_TOKEN_FALSE] = JSON_PROD_SCALAR_FALSE,
-	[OFS(JSON_NT_JSON)][JSON_TOKEN_NULL] = JSON_PROD_SCALAR_NULL,
-	[OFS(JSON_NT_JSON)][JSON_TOKEN_ARRAY_START] = JSON_PROD_ARRAY,
-	[OFS(JSON_NT_JSON)][JSON_TOKEN_OBJECT_START] = JSON_PROD_OBJECT,
+	[OFS(JSON_NT_JSON)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_SCALAR_STRING),
+	[OFS(JSON_NT_JSON)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_SCALAR_NUMBER),
+	[OFS(JSON_NT_JSON)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_SCALAR_TRUE),
+	[OFS(JSON_NT_JSON)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_SCALAR_FALSE),
+	[OFS(JSON_NT_JSON)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_SCALAR_NULL),
+	[OFS(JSON_NT_JSON)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY),
+	[OFS(JSON_NT_JSON)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_OBJECT),
 	/* ARRAY_ELEMENTS */
-	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_START] = JSON_PROD_ARRAY_ELEMENTS,
-	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_OBJECT_START] = JSON_PROD_ARRAY_ELEMENTS,
-	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_STRING] = JSON_PROD_ARRAY_ELEMENTS,
-	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NUMBER] = JSON_PROD_ARRAY_ELEMENTS,
-	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_TRUE] = JSON_PROD_ARRAY_ELEMENTS,
-	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_FALSE] = JSON_PROD_ARRAY_ELEMENTS,
-	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NULL] = JSON_PROD_ARRAY_ELEMENTS,
-	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = JSON_PROD_EPSILON,
+	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
+	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
 	/* MORE_ARRAY_ELEMENTS */
-	[OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_COMMA] = JSON_PROD_MORE_ARRAY_ELEMENTS,
-	[OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = JSON_PROD_EPSILON,
+	[OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_ARRAY_ELEMENTS),
+	[OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
 	/*
 	 * KEYPAIR - could expand but it's clearer not to.
 	 */
-	[OFS(JSON_NT_KEYPAIR)][JSON_TOKEN_STRING] = JSON_PROD_KEYPAIR,
+	[OFS(JSON_NT_KEYPAIR)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_KEYPAIR),
 	/* KEY_PAIRS */
-	[OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_STRING] = JSON_PROD_KEY_PAIRS,
-	[OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = JSON_PROD_EPSILON,
+	[OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_KEY_PAIRS),
+	[OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
 	/* MORE_KEY_PAIRS */
-	[OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_COMMA] = JSON_PROD_MORE_KEY_PAIRS,
-	[OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = JSON_PROD_EPSILON,
+	[OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_KEY_PAIRS),
+	[OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
 };
 
 /* the GOAL production. Not stored in the table, but will be the initial contents of the prediction stack */
@@ -419,11 +427,10 @@ dec_lex_level(JsonLexContext *lex)
 }
 
 static inline void
-push_prediction(JsonParserStack *pstack, char *prod)
+push_prediction(JsonParserStack *pstack, struct td_entry entry)
 {
-	int			len = strlen(prod);
-	memcpy(pstack->prediction + pstack->pred_index, prod, len);
-	pstack->pred_index += len;
+	memcpy(pstack->prediction + pstack->pred_index, entry.prod, entry.len);
+	pstack->pred_index += entry.len;
 }
 
 static inline char
@@ -651,12 +658,15 @@ pg_parse_json_incremental(JsonLexContext *lex,
 		/* use prediction stack for incremental parsing */
 
 		if (!have_prediction(pstack))
-			push_prediction(pstack, JSON_PROD_GOAL);
+		{
+			struct td_entry goal = TD_ENTRY(JSON_PROD_GOAL);
+			push_prediction(pstack, goal);
+		}
 
 		while (have_prediction(pstack))
 		{
 			char		top = pop_prediction(pstack);
-			char	   *prod;
+			struct td_entry entry;
 
 			/*
 			 * these first two branches are the guts of the Table Driven
@@ -676,14 +686,14 @@ pg_parse_json_incremental(JsonLexContext *lex,
 					tok = lex_peek(lex);
 				}
 			}
-			else if (IS_NT(top) && (prod = td_parser_table[OFS(top)][tok]) != NULL)
+			else if (IS_NT(top) && (entry = td_parser_table[OFS(top)][tok]).prod != NULL)
 			{
 				/*
 				 * the token is in the director set for a production of the
 				 * non-terminal at the top of the stack, so push the reversed
 				 * RHS of the production onto the stack.
 				 */
-				push_prediction(pstack, prod);
+				push_prediction(pstack, entry);
 			}
 			else if (IS_SEM(top))
 			{
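
For reference, the reason the TD_ENTRY macro works here: the productions are
zero-terminated char arrays, so sizeof(PROD) - 1 is the same length that
strlen() was computing at parse time, just resolved by the compiler. A tiny
standalone illustration, using a toy production rather than the real ones
from jsonapi.c:

    #include <stdio.h>
    #include <string.h>

    /* stand-in for a production: a zero-terminated char array, as in jsonapi.c */
    static char SOME_PROD[] = {1, 2, 3, 0};

    struct td_entry
    {
        size_t      len;
        char       *prod;
    };

    /* length is computed at compile time, so no strlen() while parsing */
    #define TD_ENTRY(PROD) { sizeof(PROD) - 1, (PROD) }

    int
    main(void)
    {
        struct td_entry e = TD_ENTRY(SOME_PROD);

        /* prints "3 3": sizeof - 1 matches the old runtime strlen() */
        printf("%zu %zu\n", e.len, strlen(SOME_PROD));
        return 0;
    }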