> I'm not through looking at this. However, here are a few preliminary comments
I've attached new versions of the patches with improvements that address these comments.
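In short, with these patches applied both to_tsvector and ts_headline accept json and jsonb input directly. A quick usage sketch (it mirrors the regression tests included below; the exact output depends on the active text search configuration):

  select to_tsvector('{"a": "aaa bbb", "b": ["ccc ddd"]}'::jsonb);
  select ts_headline('{"a": "aaa bbb"}'::json, tsquery('bbb'));

Only JSON string values (object member values and array elements) are passed to the text search machinery; keys, numbers, booleans and nulls are left untouched.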
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
index 6e5de8f..8f7bcfe 100644
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -16,6 +16,7 @@
 #include "tsearch/ts_cache.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
+#include "utils/jsonb.h"

 typedef struct MorphOpaque
@@ -24,6 +25,14 @@ typedef struct MorphOpaque
     int         qoperator;      /* query operator */
 } MorphOpaque;

+typedef struct TSVectorBuildState
+{
+    ParsedText *prs;
+    TSVector    result;
+    Oid         cfgId;
+} TSVectorBuildState;
+
+static void add_to_tsvector(void *state, char *elem_value, int elem_len);

 Datum
 get_current_ts_config(PG_FUNCTION_ARGS)
@@ -256,6 +265,109 @@ to_tsvector(PG_FUNCTION_ARGS)
                                         PointerGetDatum(in)));
 }

+Datum
+jsonb_to_tsvector(PG_FUNCTION_ARGS)
+{
+    Jsonb      *jb = PG_GETARG_JSONB(0);
+    TSVectorBuildState state;
+    ParsedText *prs = (ParsedText *) palloc(sizeof(ParsedText));
+
+    prs->words = NULL;
+    state.result = NULL;
+    state.cfgId = getTSCurrentConfig(true);
+    state.prs = prs;
+
+    iterate_jsonb_values(jb, &state, (JsonIterateAction) add_to_tsvector);
+
+    PG_FREE_IF_COPY(jb, 0);
+
+    if (state.result == NULL)
+    {
+        /* There weren't any string elements in the jsonb,
+         * so we need to return an empty vector */
+
+        if (prs->words != NULL)
+            pfree(prs->words);
+
+        state.result = palloc(CALCDATASIZE(0, 0));
+        SET_VARSIZE(state.result, CALCDATASIZE(0, 0));
+        state.result->size = 0;
+    }
+
+    PG_RETURN_TSVECTOR(state.result);
+}
+
+Datum
+json_to_tsvector(PG_FUNCTION_ARGS)
+{
+    text       *json = PG_GETARG_TEXT_P(0);
+    TSVectorBuildState state;
+    ParsedText *prs = (ParsedText *) palloc(sizeof(ParsedText));
+
+    prs->words = NULL;
+    state.result = NULL;
+    state.cfgId = getTSCurrentConfig(true);
+    state.prs = prs;
+
+    iterate_json_values(json, &state, (JsonIterateAction) add_to_tsvector);
+
+    PG_FREE_IF_COPY(json, 0);
+    if (state.result == NULL)
+    {
+        /* There weren't any string elements in the json,
+         * so we need to return an empty vector */
+
+        if (prs->words != NULL)
+            pfree(prs->words);
+
+        state.result = palloc(CALCDATASIZE(0, 0));
+        SET_VARSIZE(state.result, CALCDATASIZE(0, 0));
+        state.result->size = 0;
+    }
+
+    PG_RETURN_TSVECTOR(state.result);
+}
+
+/*
+ * Extend the current TSVector in _state with a new one,
+ * built over a json(b) element.
+ */
+static void
+add_to_tsvector(void *_state, char *elem_value, int elem_len)
+{
+    TSVectorBuildState *state = (TSVectorBuildState *) _state;
+    ParsedText *prs = state->prs;
+    TSVector    item_vector;
+    int         i;
+
+    prs->lenwords = elem_len / 6;
+    if (prs->lenwords == 0)
+        prs->lenwords = 2;
+
+    prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
+    prs->curwords = 0;
+    prs->pos = 0;
+
+    parsetext(state->cfgId, prs, elem_value, elem_len);
+
+    if (prs->curwords)
+    {
+        if (state->result != NULL)
+        {
+            for (i = 0; i < prs->curwords; i++)
+                prs->words[i].pos.pos = prs->words[i].pos.pos + TS_JUMP;
+
+            item_vector = make_tsvector(prs);
+
+            state->result = (TSVector) DirectFunctionCall2(tsvector_concat,
+                                        TSVectorGetDatum(state->result),
+                                        PointerGetDatum(item_vector));
+        }
+        else
+            state->result = make_tsvector(prs);
+    }
+}
+
 /*
  * to_tsquery
  */
diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c
index 8ca1c62..ab1716a 100644
--- a/src/backend/tsearch/wparser.c
+++ b/src/backend/tsearch/wparser.c
@@ -20,6 +20,7 @@
 #include "tsearch/ts_cache.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
+#include "utils/jsonb.h"
 #include "utils/varlena.h"

@@ -31,6 +32,19 @@ typedef struct
     LexDescr   *list;
 } TSTokenTypeStorage;

+/* state for ts_headline_json_* */
+typedef struct HeadlineJsonState
+{
+    HeadlineParsedText *prs;
+    TSConfigCacheEntry *cfg;
+    TSParserCacheEntry *prsobj;
+    TSQuery     query;
+    List       *prsoptions;
+    bool        transformed;
+} HeadlineJsonState;
+
+static text * headline_json_value(void *_state, char *elem_value, int elem_len);
+
 static void
 tt_setup_firstcall(FuncCallContext *funcctx, Oid prsid)
 {
@@ -362,3 +376,177 @@ ts_headline_opt(PG_FUNCTION_ARGS)
                                         PG_GETARG_DATUM(1),
                                         PG_GETARG_DATUM(2)));
 }
+
+Datum
+ts_headline_jsonb_byid_opt(PG_FUNCTION_ARGS)
+{
+    Jsonb      *out, *jb = PG_GETARG_JSONB(1);
+    TSQuery     query = PG_GETARG_TSQUERY(2);
+    text       *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ?
+        PG_GETARG_TEXT_P(3) : NULL;
+
+    HeadlineParsedText prs;
+    HeadlineJsonState *state = palloc0(sizeof(HeadlineJsonState));
+
+    memset(&prs, 0, sizeof(HeadlineParsedText));
+    prs.lenwords = 32;
+    prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+
+    state->prs = &prs;
+    state->cfg = lookup_ts_config_cache(PG_GETARG_OID(0));
+    state->prsobj = lookup_ts_parser_cache(state->cfg->prsId);
+    state->query = query;
+    if (opt)
+        state->prsoptions = deserialize_deflist(PointerGetDatum(opt));
+    else
+        state->prsoptions = NIL;
+
+    if (!OidIsValid(state->prsobj->headlineOid))
+        ereport(ERROR,
+                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                 errmsg("text search parser does not support headline creation")));
+
+    out = transform_jsonb_values(jb, state, (JsonTransformAction) headline_json_value);
+
+    PG_FREE_IF_COPY(jb, 1);
+    PG_FREE_IF_COPY(query, 2);
+    if (opt)
+        PG_FREE_IF_COPY(opt, 3);
+
+    pfree(prs.words);
+
+    if (state->transformed)
+    {
+        pfree(prs.startsel);
+        pfree(prs.stopsel);
+    }
+
+    PG_RETURN_JSONB(out);
+}
+
+Datum
+ts_headline_jsonb(PG_FUNCTION_ARGS)
+{
+    PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_jsonb_byid_opt,
+                                        ObjectIdGetDatum(getTSCurrentConfig(true)),
+                                        PG_GETARG_DATUM(0),
+                                        PG_GETARG_DATUM(1)));
+}
+
+Datum
+ts_headline_jsonb_byid(PG_FUNCTION_ARGS)
+{
+    PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_jsonb_byid_opt,
+                                        PG_GETARG_DATUM(0),
+                                        PG_GETARG_DATUM(1),
+                                        PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_jsonb_opt(PG_FUNCTION_ARGS)
+{
+    PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_jsonb_byid_opt,
+                                        ObjectIdGetDatum(getTSCurrentConfig(true)),
+                                        PG_GETARG_DATUM(0),
+                                        PG_GETARG_DATUM(1),
+                                        PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_json_byid_opt(PG_FUNCTION_ARGS)
+{
+    text       *json = PG_GETARG_TEXT_P(1);
+    TSQuery     query = PG_GETARG_TSQUERY(2);
+    text       *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ?
+        PG_GETARG_TEXT_P(3) : NULL;
+    text       *out;
+
+    HeadlineParsedText prs;
+    HeadlineJsonState *state = palloc0(sizeof(HeadlineJsonState));
+
+    memset(&prs, 0, sizeof(HeadlineParsedText));
+    prs.lenwords = 32;
+    prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+
+    state->prs = &prs;
+    state->cfg = lookup_ts_config_cache(PG_GETARG_OID(0));
+    state->prsobj = lookup_ts_parser_cache(state->cfg->prsId);
+    state->query = query;
+    if (opt)
+        state->prsoptions = deserialize_deflist(PointerGetDatum(opt));
+    else
+        state->prsoptions = NIL;
+
+    if (!OidIsValid(state->prsobj->headlineOid))
+        ereport(ERROR,
+                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                 errmsg("text search parser does not support headline creation")));
+
+    out = transform_json_values(json, state, (JsonTransformAction) headline_json_value);
+
+    PG_FREE_IF_COPY(json, 1);
+    PG_FREE_IF_COPY(query, 2);
+    if (opt)
+        PG_FREE_IF_COPY(opt, 3);
+    pfree(prs.words);
+
+    if (state->transformed)
+    {
+        pfree(prs.startsel);
+        pfree(prs.stopsel);
+    }
+
+    PG_RETURN_TEXT_P(out);
+}
+
+Datum
+ts_headline_json(PG_FUNCTION_ARGS)
+{
+    PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_json_byid_opt,
+                                        ObjectIdGetDatum(getTSCurrentConfig(true)),
+                                        PG_GETARG_DATUM(0),
+                                        PG_GETARG_DATUM(1)));
+}
+
+Datum
+ts_headline_json_byid(PG_FUNCTION_ARGS)
+{
+    PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_json_byid_opt,
+                                        PG_GETARG_DATUM(0),
+                                        PG_GETARG_DATUM(1),
+                                        PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_json_opt(PG_FUNCTION_ARGS)
+{
+    PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_json_byid_opt,
+                                        ObjectIdGetDatum(getTSCurrentConfig(true)),
+                                        PG_GETARG_DATUM(0),
+                                        PG_GETARG_DATUM(1),
+                                        PG_GETARG_DATUM(2)));
+}
+
+
+/*
+ * Return the headline in text form, generated from a json(b) element.
+ */
+static text *
+headline_json_value(void *_state, char *elem_value, int elem_len)
+{
+    HeadlineJsonState *state = (HeadlineJsonState *) _state;
+
+    HeadlineParsedText *prs = state->prs;
+    TSConfigCacheEntry *cfg = state->cfg;
+    TSParserCacheEntry *prsobj = state->prsobj;
+    TSQuery     query = state->query;
+    List       *prsoptions = state->prsoptions;
+
+    prs->curwords = 0;
+    hlparsetext(cfg->cfgId, prs, query, elem_value, elem_len);
+    FunctionCall3(&(prsobj->prsheadline),
+                  PointerGetDatum(prs),
+                  PointerGetDatum(prsoptions),
+                  PointerGetDatum(query));
+
+    state->transformed = true;
+    return generateHeadline(prs);
+}
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index a4cc86d..dd74bac 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4759,6 +4759,24 @@ DESCR("generate headline");
 DATA(insert OID = 3755 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 25 "25 3615" _null_ _null_ _null_ _null_ _null_ ts_headline _null_ _null_ _null_ ));
 DESCR("generate headline");
+DATA(insert OID = 4201 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f i s 4 0 3802 "3734 3802 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_byid_opt _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4202 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f i s 3 0 3802 "3734 3802 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_byid _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4203 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 3 0 3802 "3802 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_opt _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4204 ( ts_headline PGNSP PGUID 12 100
0 0 0 f f f f t f s s 2 0 3802 "3802 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+
+DATA(insert OID = 4205 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f i s 4 0 114 "3734 114 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_json_byid_opt _null_ _null_ _null_ ));
+DESCR("generate headline from json");
+DATA(insert OID = 4206 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f i s 3 0 114 "3734 114 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_json_byid _null_ _null_ _null_ ));
+DESCR("generate headline from json");
+DATA(insert OID = 4207 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 3 0 114 "114 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_json_opt _null_ _null_ _null_ ));
+DESCR("generate headline from json");
+DATA(insert OID = 4208 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 114 "114 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_json _null_ _null_ _null_ ));
+DESCR("generate headline from json");
+
 DATA(insert OID = 3745 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3614 "3734 25" _null_ _null_ _null_ _null_ _null_ to_tsvector_byid _null_ _null_ _null_ ));
 DESCR("transform to tsvector");
 DATA(insert OID = 3746 ( to_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ to_tsquery_byid _null_ _null_ _null_ ));
@@ -4775,6 +4793,10 @@ DATA(insert OID = 3751 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s
 DESCR("transform to tsquery");
 DATA(insert OID = 5001 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ ));
 DESCR("transform to tsquery");
+DATA(insert OID = 4209 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3614 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_to_tsvector _null_ _null_ _null_ ));
+DESCR("transform jsonb to tsvector");
+DATA(insert OID = 4210 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3614 "114" _null_ _null_ _null_ _null_ _null_ json_to_tsvector _null_ _null_ _null_ ));
+DESCR("transform json to tsvector");
 DATA(insert OID = 3752 ( tsvector_update_trigger PGNSP PGUID 12 1 0 0 0 f f f f f f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ tsvector_update_trigger_byid _null_ _null_ _null_ ));
 DESCR("trigger for automatic update of tsvector column");
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 155650c..873e2e1 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -86,6 +86,15 @@ typedef struct
 #define MAXNUMPOS (256)
 #define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) )

+/*
+ * If a TSVector contains several parts and we want to treat them as
+ * separate, it's necessary to add an artificial increment to the position of
+ * each lexeme from every next part. This is required to avoid the situation
+ * where a tsquery could find a phrase consisting of lexemes from two such
+ * parts. TS_JUMP defines the value of this increment.
+ */
+#define TS_JUMP 1
+
 /* This struct represents a complete tsvector datum */
 typedef struct
 {
diff --git a/src/test/regress/expected/json.out b/src/test/regress/expected/json.out
index 1bb8768..046ead3 100644
--- a/src/test/regress/expected/json.out
+++ b/src/test/regress/expected/json.out
@@ -1674,3 +1674,86 @@ select json_strip_nulls('{"a": {"b": null, "c": null}, "d": {} }');
 {"a":{},"d":{}}
 (1 row)

+-- json to tsvector
+select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::json);
+                                to_tsvector
+---------------------------------------------------------------------------
+ 'aaa':1 'bbb':2 'ccc':4 'ddd':3 'eee':6 'fff':7 'ggg':8 'hhh':10 'iii':11
+(1 row)
+
+-- json to tsvector with stop words
+select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::json);
+                                 to_tsvector
+----------------------------------------------------------------------------
+ 'aaa':1 'bbb':3 'ccc':5 'ddd':4 'eee':8 'fff':9 'ggg':10 'hhh':12 'iii':13
+(1 row)
+
+-- ts_vector corner cases
+select to_tsvector('""'::json);
+ to_tsvector
+-------------
+
+(1 row)
+
+select to_tsvector('{}'::json);
+ to_tsvector
+-------------
+
+(1 row)
+
+select to_tsvector('[]'::json);
+ to_tsvector
+-------------
+
+(1 row)
+
+select to_tsvector('null'::json);
+ to_tsvector
+-------------
+
+(1 row)
+
+-- ts_headline for json
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::json, tsquery('bbb & ddd & hhh'));
+                                               ts_headline
+---------------------------------------------------------------------------------------------------------
+ {"a":"aaa <b>bbb</b>","b":{"c":"ccc <b>ddd</b> fff","c1":"ccc1 ddd1"},"d":["ggg <b>hhh</b>","iii jjj"]}
+(1 row)
+
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::json, tsquery('bbb & ddd & hhh'));
+                                       ts_headline
+----------------------------------------------------------------------------------------
+ {"a":"aaa <b>bbb</b>","b":{"c":"ccc <b>ddd</b> fff"},"d":["ggg <b>hhh</b>","iii jjj"]}
+(1 row)
+
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::json, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+                                        ts_headline
+------------------------------------------------------------------------------------------
+ {"a":"aaa <bbb>","b":{"c":"ccc <ddd> fff","c1":"ccc1 ddd1"},"d":["ggg <hhh>","iii jjj"]}
+(1 row)
+
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::json, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+                                        ts_headline
+------------------------------------------------------------------------------------------
+ {"a":"aaa <bbb>","b":{"c":"ccc <ddd> fff","c1":"ccc1 ddd1"},"d":["ggg <hhh>","iii jjj"]}
+(1 row)
+
+-- corner cases for ts_headline with json
+select ts_headline('null'::json, tsquery('aaa & bbb'));
+ ts_headline
+-------------
+ null
+(1 row)
+
+select ts_headline('{}'::json, tsquery('aaa & bbb'));
+ ts_headline
+-------------
+ {}
+(1 row)
+
+select ts_headline('[]'::json, tsquery('aaa & bbb'));
+ ts_headline
+-------------
+ []
+(1 row)
+
diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out
index 8ec4150..7cbbcfc 100644
--- a/src/test/regress/expected/jsonb.out
+++ b/src/test/regress/expected/jsonb.out
@@ -3474,3 +3474,86 @@ HINT:  Try using the function jsonb_set to replace key value.
select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"', true); ERROR: cannot replace existing key HINT: Try using the function jsonb_set to replace key value. +-- jsonb to tsvector +select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::jsonb); + to_tsvector +--------------------------------------------------------------------------- + 'aaa':1 'bbb':2 'ccc':4 'ddd':3 'eee':6 'fff':7 'ggg':8 'hhh':10 'iii':11 +(1 row) + +-- jsonb to tsvector with stop words +select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::jsonb); + to_tsvector +---------------------------------------------------------------------------- + 'aaa':1 'bbb':3 'ccc':5 'ddd':4 'eee':8 'fff':9 'ggg':10 'hhh':12 'iii':13 +(1 row) + +-- ts_vector corner cases +select to_tsvector('""'::jsonb); + to_tsvector +------------- + +(1 row) + +select to_tsvector('{}'::jsonb); + to_tsvector +------------- + +(1 row) + +select to_tsvector('[]'::jsonb); + to_tsvector +------------- + +(1 row) + +select to_tsvector('null'::jsonb); + to_tsvector +------------- + +(1 row) + +-- ts_headline for jsonb +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh')); + ts_headline +------------------------------------------------------------------------------------------------------------------ + {"a": "aaa <b>bbb</b>", "b": {"c": "ccc <b>ddd</b> fff", "c1": "ccc1 ddd1"}, "d": ["ggg <b>hhh</b>", "iii jjj"]} +(1 row) + +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh')); + ts_headline +----------------------------------------------------------------------------------------------- + {"a": "aaa <b>bbb</b>", "b": {"c": "ccc <b>ddd</b> fff"}, "d": ["ggg <b>hhh</b>", "iii jjj"]} +(1 row) + +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); + ts_headline +--------------------------------------------------------------------------------------------------- + {"a": "aaa <bbb>", "b": {"c": "ccc <ddd> fff", "c1": "ccc1 ddd1"}, "d": ["ggg <hhh>", "iii jjj"]} +(1 row) + +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); + ts_headline +--------------------------------------------------------------------------------------------------- + {"a": "aaa <bbb>", "b": {"c": "ccc <ddd> fff", "c1": "ccc1 ddd1"}, "d": ["ggg <hhh>", "iii jjj"]} +(1 row) + +-- corner cases for ts_headline with jsonb +select ts_headline('null'::jsonb, tsquery('aaa & bbb')); + ts_headline +------------- + null +(1 row) + +select ts_headline('{}'::jsonb, tsquery('aaa & bbb')); + ts_headline +------------- + {} +(1 row) + +select ts_headline('[]'::jsonb, tsquery('aaa & bbb')); + ts_headline +------------- + [] +(1 row) + diff --git a/src/test/regress/sql/json.sql b/src/test/regress/sql/json.sql index 5e61922..e661f96 100644 --- a/src/test/regress/sql/json.sql +++ b/src/test/regress/sql/json.sql @@ -551,3 +551,26 @@ select json_strip_nulls('[1,{"a":1,"b":null,"c":2},3]'); -- an empty object is not null and should not be stripped select json_strip_nulls('{"a": {"b": null, "c": null}, "d": {} }'); + +-- json to tsvector +select to_tsvector('{"a": "aaa bbb ddd ccc", 
"b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::json); + +-- json to tsvector with stop words +select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::json); + +-- ts_vector corner cases +select to_tsvector('""'::json); +select to_tsvector('{}'::json); +select to_tsvector('[]'::json); +select to_tsvector('null'::json); + +-- ts_headline for json +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::json, tsquery('bbb & ddd & hhh')); +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::json, tsquery('bbb & ddd & hhh')); +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::json, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::json, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); + +-- corner cases for ts_headline with json +select ts_headline('null'::json, tsquery('aaa & bbb')); +select ts_headline('{}'::json, tsquery('aaa & bbb')); +select ts_headline('[]'::json, tsquery('aaa & bbb')); diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql index e2eaca0..2e31ee6 100644 --- a/src/test/regress/sql/jsonb.sql +++ b/src/test/regress/sql/jsonb.sql @@ -878,3 +878,26 @@ select jsonb_insert('{"a": {"b": "value"}}', '{a, c}', '"new_value"', true); select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"'); select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"', true); + +-- jsonb to tsvector +select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::jsonb); + +-- jsonb to tsvector with stop words +select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::jsonb); + +-- ts_vector corner cases +select to_tsvector('""'::jsonb); +select to_tsvector('{}'::jsonb); +select to_tsvector('[]'::jsonb); +select to_tsvector('null'::jsonb); + +-- ts_headline for jsonb +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh')); +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh')); +select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); +select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >'); + +-- corner cases for ts_headline with jsonb +select ts_headline('null'::jsonb, tsquery('aaa & bbb')); +select ts_headline('{}'::jsonb, tsquery('aaa & bbb')); +select ts_headline('[]'::jsonb, tsquery('aaa & bbb'));
diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c
index 6a7aab2..8fbe08d 100644
--- a/src/backend/utils/adt/jsonfuncs.c
+++ b/src/backend/utils/adt/jsonfuncs.c
@@ -42,6 +42,8 @@
 #define JB_PATH_CREATE_OR_INSERT \
     (JB_PATH_INSERT_BEFORE | JB_PATH_INSERT_AFTER | JB_PATH_CREATE)

+#define is_jsonb_data(type) (type == WJB_KEY || type == WJB_VALUE || type == WJB_ELEM)
+
 /* state for json_object_keys */
 typedef struct OkeysState
 {
@@ -52,6 +54,23 @@ typedef struct OkeysState
     int         sent_count;
 } OkeysState;

+/* state for iterate_json function */
+typedef struct IterateJsonState
+{
+    JsonLexContext *lex;
+    JsonIterateAction action;       /* an action that will be applied to each json value */
+    void       *action_state;       /* any necessary context for iteration */
+} IterateJsonState;
+
+/* state for transform_json function */
+typedef struct TransformJsonState
+{
+    JsonLexContext *lex;
+    StringInfo  strval;             /* resulting json */
+    JsonTransformAction action;     /* an action that will be applied to each json value */
+    void       *action_state;       /* any necessary context for transformation */
+} TransformJsonState;
+
 /* state for json_get* functions */
 typedef struct GetState
 {
@@ -271,6 +290,18 @@ static void setPathArray(JsonbIterator **it, Datum *path_elems,
                  int level, Jsonb *newval, uint32 nelems, int op_type);
 static void addJsonbToParseState(JsonbParseState **jbps, Jsonb *jb);

+/* function supporting iterate_json(b) */
+static void apply_action(void *state, char *token, JsonTokenType tokentype);
+
+/* functions supporting transform_json(b) */
+static void transform_object_start(void *state);
+static void transform_object_end(void *state);
+static void transform_array_start(void *state);
+static void transform_array_end(void *state);
+static void transform_object_field_start(void *state, char *fname, bool isnull);
+static void transform_array_element_start(void *state, bool isnull);
+static void transform_scalar(void *state, char *token, JsonTokenType tokentype);

 /*
  * SQL function json_object_keys
@@ -4130,3 +4161,202 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls,
         }
     }
 }
+
+/*
+ * Iterate over jsonb string values or elements, and pass them together with
+ * an iteration state to a specified JsonIterateAction.
+ */
+void
+iterate_jsonb_values(Jsonb *jb, void *state, JsonIterateAction action)
+{
+    JsonbIterator *it;
+    JsonbValue  v;
+    JsonbIteratorToken type;
+
+    it = JsonbIteratorInit(&jb->root);
+
+    while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
+    {
+        if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString)
+        {
+            action(state, v.val.string.val, v.val.string.len);
+        }
+    }
+}
+
+/*
+ * Iterate over json string values or elements, and pass them together with an
+ * iteration state to a specified JsonIterateAction.
+ */
+void
+iterate_json_values(text *json, void *action_state, JsonIterateAction action)
+{
+    JsonLexContext *lex = makeJsonLexContext(json, true);
+    JsonSemAction *sem = palloc0(sizeof(JsonSemAction));
+    IterateJsonState *state = palloc0(sizeof(IterateJsonState));
+
+    state->lex = lex;
+    state->action = action;
+    state->action_state = action_state;
+
+    sem->semstate = (void *) state;
+    sem->scalar = apply_action;
+
+    pg_parse_json(lex, sem);
+}
+
+/*
+ * An auxiliary function for iterate_json_values to invoke a specified
+ * JsonIterateAction.
+ */
+static void
+apply_action(void *state, char *token, JsonTokenType tokentype)
+{
+    IterateJsonState *_state = (IterateJsonState *) state;
+
+    if (tokentype == JSON_TOKEN_STRING)
+        (*_state->action) (_state->action_state, token, strlen(token));
+}
+
+/*
+ * Iterate over a jsonb, and apply a specified JsonTransformAction to every
+ * string value or element. Any necessary context for a JsonTransformAction
+ * can be passed in the action_state variable. The function returns a copy of
+ * the original jsonb with transformed values.
+ */
+Jsonb *
+transform_jsonb_values(Jsonb *jsonb, void *action_state, JsonTransformAction transform_action)
+{
+    JsonbIterator *it;
+    JsonbValue  v, *res = NULL;
+    JsonbIteratorToken type;
+    JsonbParseState *st = NULL;
+    text       *out;
+    bool        is_scalar = false;
+
+    it = JsonbIteratorInit(&jsonb->root);
+    is_scalar = it->isScalar;
+
+    while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
+    {
+        if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString)
+        {
+            out = transform_action(action_state, v.val.string.val, v.val.string.len);
+            v.val.string.val = VARDATA_ANY(out);
+            v.val.string.len = VARSIZE_ANY_EXHDR(out);
+            res = pushJsonbValue(&st, type, type < WJB_BEGIN_ARRAY ? &v : NULL);
+        }
+        else
+        {
+            res = pushJsonbValue(&st, type, is_jsonb_data(type) ? &v : NULL);
+        }
+    }
+
+    if (res->type == jbvArray)
+        res->val.array.rawScalar = is_scalar;
+
+    return JsonbValueToJsonb(res);
+}
+
+/*
+ * Iterate over a json, and apply a specified JsonTransformAction to every
+ * string value or element. Any necessary context for a JsonTransformAction
+ * can be passed in the action_state variable. The function returns a copy of
+ * the original json, as text, with transformed values.
+ */
+text *
+transform_json_values(text *json, void *action_state, JsonTransformAction transform_action)
+{
+    JsonLexContext *lex = makeJsonLexContext(json, true);
+    JsonSemAction *sem = palloc0(sizeof(JsonSemAction));
+    TransformJsonState *state = palloc0(sizeof(TransformJsonState));
+
+    state->lex = lex;
+    state->strval = makeStringInfo();
+    state->action = transform_action;
+    state->action_state = action_state;
+
+    sem->semstate = (void *) state;
+    sem->object_start = transform_object_start;
+    sem->object_end = transform_object_end;
+    sem->array_start = transform_array_start;
+    sem->array_end = transform_array_end;
+    sem->scalar = transform_scalar;
+    sem->array_element_start = transform_array_element_start;
+    sem->object_field_start = transform_object_field_start;
+
+    pg_parse_json(lex, sem);
+
+    return cstring_to_text_with_len(state->strval->data, state->strval->len);
+}
+
+/*
+ * Set of auxiliary functions for transform_json to invoke a specified
+ * JsonTransformAction for all string values and leave everything else untouched.
+ */
+static void
+transform_object_start(void *state)
+{
+    TransformJsonState *_state = (TransformJsonState *) state;
+
+    appendStringInfoCharMacro(_state->strval, '{');
+}
+
+static void
+transform_object_end(void *state)
+{
+    TransformJsonState *_state = (TransformJsonState *) state;
+
+    appendStringInfoCharMacro(_state->strval, '}');
+}
+
+static void
+transform_array_start(void *state)
+{
+    TransformJsonState *_state = (TransformJsonState *) state;
+
+    appendStringInfoCharMacro(_state->strval, '[');
+}
+
+static void
+transform_array_end(void *state)
+{
+    TransformJsonState *_state = (TransformJsonState *) state;
+
+    appendStringInfoCharMacro(_state->strval, ']');
+}
+
+static void
+transform_object_field_start(void *state, char *fname, bool isnull)
+{
+    TransformJsonState *_state = (TransformJsonState *) state;
+
+    if (_state->strval->data[_state->strval->len - 1] != '{')
+        appendStringInfoCharMacro(_state->strval, ',');
+
+    /*
+     * Unfortunately we don't have the quoted and escaped string any more, so
+     * we have to re-escape it.
+     */
+    escape_json(_state->strval, fname);
+    appendStringInfoCharMacro(_state->strval, ':');
+}
+
+static void
+transform_array_element_start(void *state, bool isnull)
+{
+    TransformJsonState *_state = (TransformJsonState *) state;
+
+    if (_state->strval->data[_state->strval->len - 1] != '[')
+        appendStringInfoCharMacro(_state->strval, ',');
+}
+
+static void
+transform_scalar(void *state, char *token, JsonTokenType tokentype)
+{
+    TransformJsonState *_state = (TransformJsonState *) state;
+
+    if (tokentype == JSON_TOKEN_STRING)
+    {
+        text *out = (*_state->action) (_state->action_state, token, strlen(token));
+
+        escape_json(_state->strval, text_to_cstring(out));
+    }
+    else
+        appendStringInfoString(_state->strval, token);
+}
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 411e158..2d0ee19 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -351,6 +351,12 @@ typedef struct JsonbIterator
     struct JsonbIterator *parent;
 } JsonbIterator;

+/* an action that will be applied to each value in iterate_json(b) functions */
+typedef void (*JsonIterateAction) (void *state, char *elem_value, int elem_len);
+
+/* an action that will be applied to each value in transform_json(b) functions */
+typedef text * (*JsonTransformAction) (void *state, char *elem_value, int elem_len);
+
 /* Support functions */
 extern uint32 getJsonbOffset(const JsonbContainer *jc, int index);
@@ -377,5 +383,12 @@ extern char *JsonbToCString(StringInfo out, JsonbContainer *in,
 extern char *JsonbToCStringIndent(StringInfo out, JsonbContainer *in,
                       int estimated_len);

+extern void iterate_jsonb_values(Jsonb *jb, void *state, JsonIterateAction action);
+extern void iterate_json_values(text *json, void *action_state, JsonIterateAction action);
+extern Jsonb *transform_jsonb_values(Jsonb *jsonb, void *action_state,
+                       JsonTransformAction transform_action);
+extern text *transform_json_values(text *json, void *action_state,
+                       JsonTransformAction transform_action);
+
 #endif   /* __JSONB_H__ */
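To illustrate how this can be used (the table and column names below are hypothetical, not part of the patches), the new functions allow full-text queries directly against json(b) documents:

  select * from docs where to_tsvector(payload) @@ to_tsquery('bbb & ddd');

Note that the one-argument to_tsvector variants and the ts_headline variants without an explicit configuration are declared stable in pg_proc (they depend on default_text_search_config), so while they work in queries like the one above, they cannot be used directly in an expression index.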