Hi everyone,

The code in its current state looks messy and way too complicated; there are lots of interleaving code branches. Thus, I decided to split gettoken_query() into three independent tokenizers for phrase, web and original (to_tsquery()) syntaxes. Documentation is included. Any feedback is very welcome.
-- Dmitry Ivanov Postgres Professional: http://www.postgrespro.com The Russian Postgres Company
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 5abb1c46fb..c3b7be6e4e 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -9609,6 +9609,18 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple <entry><literal>phraseto_tsquery('english', 'The Fat Rats')</literal></entry> <entry><literal>'fat' <-> 'rat'</literal></entry> </row> + <row> + <entry> + <indexterm> + <primary>websearch_to_tsquery</primary> + </indexterm> + <literal><function>websearch_to_tsquery(<optional> <replaceable class="parameter">config</replaceable> <type>regconfig</type> , </optional> <replaceable class="parameter">query</replaceable> <type>text</type>)</function></literal> + </entry> + <entry><type>tsquery</type></entry> + <entry>produce <type>tsquery</type> from a web search style query</entry> + <entry><literal>websearch_to_tsquery('english', '"fat rat" or rat')</literal></entry> + <entry><literal>'fat' <-> 'rat' | 'rat'</literal></entry> + </row> <row> <entry> <indexterm> diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 610b7bf033..19f58511c8 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -797,13 +797,16 @@ UPDATE tt SET ti = <para> <productname>PostgreSQL</productname> provides the functions <function>to_tsquery</function>, - <function>plainto_tsquery</function>, and - <function>phraseto_tsquery</function> + <function>plainto_tsquery</function>, + <function>phraseto_tsquery</function> and + <function>websearch_to_tsquery</function> for converting a query to the <type>tsquery</type> data type. <function>to_tsquery</function> offers access to more features than either <function>plainto_tsquery</function> or - <function>phraseto_tsquery</function>, but it is less forgiving - about its input. + <function>phraseto_tsquery</function>, but it is less forgiving about its + input. 
<function>websearch_to_tsquery</function> is a simplified version + of <function>to_tsquery</function> with an alternative syntax, similar + to the one used by web search engines. </para> <indexterm> @@ -962,6 +965,87 @@ SELECT phraseto_tsquery('english', 'The Fat & Rats:C'); </screen> </para> +<synopsis> +websearch_to_tsquery(<optional> <replaceable class="parameter">config</replaceable> <type>regconfig</type>, </optional> <replaceable class="parameter">querytext</replaceable> <type>text</type>) returns <type>tsquery</type> +</synopsis> + + <para> + <function>websearch_to_tsquery</function> creates a <type>tsquery</type> + value from <replaceable>querytext</replaceable> using an alternative + syntax in which simple unformatted text is a valid query. + Unlike <function>plainto_tsquery</function> + and <function>phraseto_tsquery</function>, it also recognizes certain + operators. Moreover, this function should never raise syntax errors, + which makes it possible to use raw user-supplied input for search. + The following syntax is supported: + <itemizedlist spacing="compact" mark="bullet"> + <listitem> + <para> + <literal>unquoted text</literal>: text not inside quote marks will be + converted to terms separated by <literal>&</literal> operators, as + if processed by + <function>plainto_tsquery</function>. + </para> + </listitem> + <listitem> + <para> + <literal>"quoted text"</literal>: text inside quote marks will be + converted to terms separated by <literal><-></literal> + operators, as if processed by <function>phraseto_tsquery</function>. + </para> + </listitem> + <listitem> + <para> + <literal>OR</literal>: logical or will be converted to + the <literal>|</literal> operator. + </para> + </listitem> + <listitem> + <para> + <literal>-</literal>: the logical not operator, converted to + the <literal>!</literal> operator. 
+ </para> + </listitem> + </itemizedlist> + </para> + <para> + Examples: + <screen> + select websearch_to_tsquery('english', 'The fat rats'); + websearch_to_tsquery + ----------------- + 'fat' & 'rat' + (1 row) + </screen> + <screen> + select websearch_to_tsquery('english', '"supernovae stars" -crab'); + websearch_to_tsquery + ---------------------------------- + 'supernova' <-> 'star' & !'crab' + (1 row) + </screen> + <screen> + select websearch_to_tsquery('english', '"sad cat" or "fat rat"'); + websearch_to_tsquery + ----------------------------------- + 'sad' <-> 'cat' | 'fat' <-> 'rat' + (1 row) + </screen> + <screen> + select websearch_to_tsquery('english', 'signal -"segmentation fault"'); + websearch_to_tsquery + --------------------------------------- + 'signal' & !( 'segment' <-> 'fault' ) + (1 row) + </screen> + <screen> + select websearch_to_tsquery('english', '""" )( dummy \\ query <->'); + websearch_to_tsquery + ---------------------- + 'dummi' & 'queri' + (1 row) + </screen> + </para> </sect2> <sect2 id="textsearch-ranking"> diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index ea5947a3a8..6055fb6b4e 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -490,7 +490,7 @@ to_tsquery_byid(PG_FUNCTION_ARGS) query = parse_tsquery(text_to_cstring(in), pushval_morph, PointerGetDatum(&data), - false); + 0); PG_RETURN_TSQUERY(query); } @@ -520,7 +520,7 @@ plainto_tsquery_byid(PG_FUNCTION_ARGS) query = parse_tsquery(text_to_cstring(in), pushval_morph, PointerGetDatum(&data), - true); + P_TSQ_PLAIN); PG_RETURN_POINTER(query); } @@ -551,7 +551,7 @@ phraseto_tsquery_byid(PG_FUNCTION_ARGS) query = parse_tsquery(text_to_cstring(in), pushval_morph, PointerGetDatum(&data), - true); + P_TSQ_PLAIN); PG_RETURN_TSQUERY(query); } @@ -567,3 +567,35 @@ phraseto_tsquery(PG_FUNCTION_ARGS) ObjectIdGetDatum(cfgId), PointerGetDatum(in))); } + +Datum +websearch_to_tsquery_byid(PG_FUNCTION_ARGS) +{ + text *in = 
PG_GETARG_TEXT_PP(1); + MorphOpaque data; + TSQuery query = NULL; + + data.cfg_id = PG_GETARG_OID(0); + + data.qoperator = OP_AND; + + query = parse_tsquery(text_to_cstring(in), + pushval_morph, + PointerGetDatum(&data), + P_TSQ_WEB); + + PG_RETURN_TSQUERY(query); +} + +Datum +websearch_to_tsquery(PG_FUNCTION_ARGS) +{ + text *in = PG_GETARG_TEXT_PP(0); + Oid cfgId; + + cfgId = getTSCurrentConfig(true); + PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); + +} diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 1ccbf79030..d5df8b5506 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -32,14 +32,53 @@ const int tsearch_op_priority[OP_COUNT] = 3 /* OP_PHRASE */ }; +/* + * parser's states + */ +typedef enum +{ + WAITOPERAND = 1, + WAITOPERATOR = 2, + WAITFIRSTOPERAND = 3 +} ts_parserstate; + +/* + * token types for parsing + */ +typedef enum +{ + PT_END = 0, + PT_ERR = 1, + PT_VAL = 2, + PT_OPR = 3, + PT_OPEN = 4, + PT_CLOSE = 5 +} ts_tokentype; + +/* + * get token from query string + * + * *operator is filled in with OP_* when return values is PT_OPR, + * but *weight could contain a distance value in case of phrase operator. 
+ * *strval, *lenval and *weight are filled in when return value is PT_VAL + * + */ +typedef ts_tokentype ts_tokenizer(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix); + struct TSQueryParserStateData { - /* State for gettoken_query */ + /* Tokenizer used for parsing tsquery */ + ts_tokenizer *gettoken; + + /* State of tokenizer function */ char *buffer; /* entire string we are scanning */ char *buf; /* current scan point */ - int state; int count; /* nesting count, incremented by (, * decremented by ) */ + bool in_quotes; /* phrase in quotes "" */ + ts_parserstate state; /* polish (prefix) notation in list, filled in by push* functions */ List *polstr; @@ -57,12 +96,6 @@ struct TSQueryParserStateData TSVectorParseState valstate; }; -/* parser's states */ -#define WAITOPERAND 1 -#define WAITOPERATOR 2 -#define WAITFIRSTOPERAND 3 -#define WAITSINGLEOPERAND 4 - /* * subroutine to parse the modifiers (weight and prefix flag currently) * part, like ':AB*' of a query. @@ -198,35 +231,34 @@ err: } /* - * token types for parsing + * Parse OR operator used in websearch_to_tsquery(). */ -typedef enum +static bool +parse_or_operator(TSQueryParserState state) { - PT_END = 0, - PT_ERR = 1, - PT_VAL = 2, - PT_OPR = 3, - PT_OPEN = 4, - PT_CLOSE = 5 -} ts_tokentype; + char *buf = state->buf; + + if (state->in_quotes) + return false; + + return (t_iseq(&buf[0], 'o') || t_iseq(&buf[0], 'O')) && + (t_iseq(&buf[1], 'r') || t_iseq(&buf[1], 'R')) && + (buf[2] != '\0' && + !t_iseq(&buf[2], '-') && + !t_iseq(&buf[2], '_') && + !t_isalpha(&buf[2]) && + !t_isdigit(&buf[2])); +} -/* - * get token from query string - * - * *operator is filled in with OP_* when return values is PT_OPR, - * but *weight could contain a distance value in case of phrase operator. 
- * *strval, *lenval and *weight are filled in when return value is PT_VAL - * - */ static ts_tokentype -gettoken_query(TSQueryParserState state, - int8 *operator, - int *lenval, char **strval, int16 *weight, bool *prefix) +gettoken_query_standard(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) { *weight = 0; *prefix = false; - while (1) + while (true) { switch (state->state) { @@ -234,17 +266,20 @@ gettoken_query(TSQueryParserState state, case WAITOPERAND: if (t_iseq(state->buf, '!')) { - (state->buf)++; /* can safely ++, t_iseq guarantee that - * pg_mblen()==1 */ - *operator = OP_NOT; + state->buf++; state->state = WAITOPERAND; + + if (state->in_quotes) + continue; + + *operator = OP_NOT; return PT_OPR; } else if (t_iseq(state->buf, '(')) { - state->count++; - (state->buf)++; + state->buf++; state->state = WAITOPERAND; + state->count++; return PT_OPEN; } else if (t_iseq(state->buf, ':')) @@ -256,10 +291,7 @@ gettoken_query(TSQueryParserState state, } else if (!t_isspace(state->buf)) { - /* - * We rely on the tsvector parser to parse the value for - * us - */ + /* We rely on the tsvector parser to parse the value for us */ reset_tsvector_parser(state->valstate, state->buf); if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf)) { @@ -268,7 +300,9 @@ gettoken_query(TSQueryParserState state, return PT_VAL; } else if (state->state == WAITFIRSTOPERAND) + { return PT_END; + } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -276,58 +310,209 @@ gettoken_query(TSQueryParserState state, state->buffer))); } break; + case WAITOPERATOR: if (t_iseq(state->buf, '&')) { + state->buf++; state->state = WAITOPERAND; *operator = OP_AND; - (state->buf)++; return PT_OPR; } else if (t_iseq(state->buf, '|')) { + state->buf++; state->state = WAITOPERAND; *operator = OP_OR; - (state->buf)++; return PT_OPR; } else if (t_iseq(state->buf, '<')) { - state->state = WAITOPERAND; - *operator = OP_PHRASE; /* 
weight var is used as storage for distance */ state->buf = parse_phrase_operator(state->buf, weight); + state->state = WAITOPERAND; + *operator = OP_PHRASE; if (*weight < 0) return PT_ERR; return PT_OPR; } else if (t_iseq(state->buf, ')')) { - (state->buf)++; + state->buf++; state->count--; return (state->count < 0) ? PT_ERR : PT_CLOSE; } - else if (*(state->buf) == '\0') - return (state->count) ? PT_ERR : PT_END; + else if (*state->buf == '\0') + { + return state->count ? PT_ERR : PT_END; + } else if (!t_isspace(state->buf)) + { return PT_ERR; + } break; - case WAITSINGLEOPERAND: - if (*(state->buf) == '\0') + } + + state->buf += pg_mblen(state->buf); + } +} + +static ts_tokentype +gettoken_query_websearch(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) +{ + *weight = 0; + *prefix = false; + + while (true) + { + switch (state->state) + { + case WAITFIRSTOPERAND: + case WAITOPERAND: + if (t_iseq(state->buf, '-')) + { + state->buf++; + state->state = WAITOPERAND; + + if (state->in_quotes) + continue; + + *operator = OP_NOT; + return PT_OPR; + } + else if (t_iseq(state->buf, '"')) + { + state->buf++; + + if (!state->in_quotes) + { + state->state = WAITOPERAND; + + if (strchr(state->buf, '"')) + { + /* quoted text should be ordered <-> */ + state->in_quotes = true; + return PT_OPEN; + } + + /* web search tolerates missing quotes */ + continue; + } + else + { + /* we have to provide an operand */ + state->in_quotes = false; + state->state = WAITOPERATOR; + pushStop(state); + return PT_CLOSE; + } + } + else if (ISOPERATOR(state->buf)) + { + /* or else gettoken_tsvector() will raise an error */ + state->buf++; + state->state = WAITOPERAND; + continue; + } + else if (!t_isspace(state->buf)) + { + /* We rely on the tsvector parser to parse the value for us */ + reset_tsvector_parser(state->valstate, state->buf); + if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf)) + { + state->state = 
WAITOPERATOR; + return PT_VAL; + } + else if (state->state == WAITFIRSTOPERAND) + { + return PT_END; + } + else + { + /* finally, we have to provide an operand */ + pushStop(state); + return PT_END; + } + } + break; + + case WAITOPERATOR: + if (t_iseq(state->buf, '"')) + { + if (!state->in_quotes) + { + /* + * put implicit AND after an operand + * and handle this quote in WAITOPERAND + */ + state->state = WAITOPERAND; + *operator = OP_AND; + return PT_OPR; + } + else + { + state->buf++; + + /* just close quotes */ + state->in_quotes = false; + return PT_CLOSE; + } + } + else if (parse_or_operator(state)) + { + state->buf += 2; /* strlen("OR") */ + state->state = WAITOPERAND; + *operator = OP_OR; + return PT_OPR; + } + else if (*state->buf == '\0') + { return PT_END; - *strval = state->buf; - *lenval = strlen(state->buf); - state->buf += strlen(state->buf); - state->count++; - return PT_VAL; - default: - return PT_ERR; + } + else if (!t_isspace(state->buf)) + { + if (state->in_quotes) + { + /* put implicit <-> after an operand */ + *operator = OP_PHRASE; + *weight = 1; + } + else + { + /* put implicit AND after an operand */ + *operator = OP_AND; + } + + state->state = WAITOPERAND; + return PT_OPR; + } break; } + state->buf += pg_mblen(state->buf); } } +static ts_tokentype +gettoken_query_plain(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) +{ + *weight = 0; + *prefix = false; + + if (*state->buf == '\0') + return PT_END; + + *strval = state->buf; + *lenval = strlen(state->buf); + state->buf += strlen(state->buf); + state->count++; + return PT_VAL; +} + /* * Push an operator to state->polstr */ @@ -489,7 +674,9 @@ makepol(TSQueryParserState state, /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); - while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight, &prefix)) != PT_END) + while ((type = state->gettoken(state, &operator, + &lenval, &strval, + 
&weight, &prefix)) != PT_END) { switch (type) { @@ -605,7 +792,7 @@ TSQuery parse_tsquery(char *buf, PushFunction pushval, Datum opaque, - bool isplain) + int flags) { struct TSQueryParserStateData state; int i; @@ -613,17 +800,38 @@ parse_tsquery(char *buf, int commonlen; QueryItem *ptr; ListCell *cell; - bool needcleanup; + bool needcleanup, + is_plain, + is_web; + int tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY; + + is_plain = (flags & P_TSQ_PLAIN) != 0; + is_web = (flags & P_TSQ_WEB) != 0; + + /* plain should not be used with web */ + Assert(!(is_plain && is_web)); + + if (is_web) + tsv_flags |= P_TSV_IS_WEB; + + /* select suitable tokenizer */ + if (is_plain) + state.gettoken = gettoken_query_plain; + else if (is_web) + state.gettoken = gettoken_query_websearch; + else + state.gettoken = gettoken_query_standard; /* init state */ state.buffer = buf; state.buf = buf; - state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND; state.count = 0; + state.in_quotes = false; + state.state = WAITFIRSTOPERAND; state.polstr = NIL; /* init value parser's state */ - state.valstate = init_tsvector_parser(state.buffer, true, true); + state.valstate = init_tsvector_parser(state.buffer, tsv_flags); /* init list of operand */ state.sumlen = 0; @@ -716,7 +924,7 @@ tsqueryin(PG_FUNCTION_ARGS) { char *in = PG_GETARG_CSTRING(0); - PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), false)); + PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), 0)); } /* diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 64e02ef434..7a27bd12a3 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -200,7 +200,7 @@ tsvectorin(PG_FUNCTION_ARGS) char *cur; int buflen = 256; /* allocated size of tmpbuf */ - state = init_tsvector_parser(buf, false, false); + state = init_tsvector_parser(buf, 0); arrlen = 64; arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); diff --git 
a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c index 7367ba6a40..fed411a842 100644 --- a/src/backend/utils/adt/tsvector_parser.c +++ b/src/backend/utils/adt/tsvector_parser.c @@ -33,6 +33,7 @@ struct TSVectorParseStateData int eml; /* max bytes per character */ bool oprisdelim; /* treat ! | * ( ) as delimiters? */ bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */ + bool is_web; /* we're in websearch_to_tsquery() */ }; @@ -42,7 +43,7 @@ struct TSVectorParseStateData * ! | & ( ) */ TSVectorParseState -init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery) +init_tsvector_parser(char *input, int flags) { TSVectorParseState state; @@ -52,8 +53,9 @@ init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery) state->len = 32; state->word = (char *) palloc(state->len); state->eml = pg_database_encoding_max_length(); - state->oprisdelim = oprisdelim; - state->is_tsquery = is_tsquery; + state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0; + state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0; + state->is_web = (flags & P_TSV_IS_WEB) != 0; return state; } @@ -89,16 +91,6 @@ do { \ } \ } while (0) -/* phrase operator begins with '<' */ -#define ISOPERATOR(x) \ - ( pg_mblen(x) == 1 && ( *(x) == '!' 
|| \ - *(x) == '&' || \ - *(x) == '|' || \ - *(x) == '(' || \ - *(x) == ')' || \ - *(x) == '<' \ - ) ) - /* Fills gettoken_tsvector's output parameters, and returns true */ #define RETURN_TOKEN \ do { \ @@ -183,14 +175,15 @@ gettoken_tsvector(TSVectorParseState state, { if (*(state->prsbuf) == '\0') return false; - else if (t_iseq(state->prsbuf, '\'')) + else if (!state->is_web && t_iseq(state->prsbuf, '\'')) statecode = WAITENDCMPLX; - else if (t_iseq(state->prsbuf, '\\')) + else if (!state->is_web && t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } - else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) + else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) || + (state->is_web && t_iseq(state->prsbuf, '"'))) PRSSYNTAXERROR; else if (!t_isspace(state->prsbuf)) { @@ -217,13 +210,14 @@ gettoken_tsvector(TSVectorParseState state, } else if (statecode == WAITENDWORD) { - if (t_iseq(state->prsbuf, '\\')) + if (!state->is_web && t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || - (state->oprisdelim && ISOPERATOR(state->prsbuf))) + (state->oprisdelim && ISOPERATOR(state->prsbuf)) || + (state->is_web && t_iseq(state->prsbuf, '"'))) { RESIZEPRSBUF; if (curpos == state->word) @@ -250,11 +244,11 @@ gettoken_tsvector(TSVectorParseState state, } else if (statecode == WAITENDCMPLX) { - if (t_iseq(state->prsbuf, '\'')) + if (!state->is_web && t_iseq(state->prsbuf, '\'')) { statecode = WAITCHARCMPLX; } - else if (t_iseq(state->prsbuf, '\\')) + else if (!state->is_web && t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDCMPLX; @@ -270,7 +264,7 @@ gettoken_tsvector(TSVectorParseState state, } else if (statecode == WAITCHARCMPLX) { - if (t_iseq(state->prsbuf, '\'')) + if (!state->is_web && t_iseq(state->prsbuf, '\'')) { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); diff --git a/src/include/catalog/pg_proc.h 
b/src/include/catalog/pg_proc.h index 90d994c71a..560416636b 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4966,6 +4966,8 @@ DATA(insert OID = 3747 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s DESCR("transform to tsquery"); DATA(insert OID = 5006 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery_byid _null_ _null_ _null_ )); DESCR("transform to tsquery"); +DATA(insert OID = 8889 ( websearch_to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ websearch_to_tsquery_byid _null_ _null_ _null_ )); +DESCR("transform to tsquery"); DATA(insert OID = 3749 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "25" _null_ _null_ _null_ _null_ _null_ to_tsvector _null_ _null_ _null_ )); DESCR("transform to tsvector"); DATA(insert OID = 3750 ( to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ to_tsquery _null_ _null_ _null_ )); @@ -4974,6 +4976,8 @@ DATA(insert OID = 3751 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s DESCR("transform to tsquery"); DATA(insert OID = 5001 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ )); DESCR("transform to tsquery"); +DATA(insert OID = 8890 ( websearch_to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ websearch_to_tsquery _null_ _null_ _null_ )); +DESCR("transform to tsquery"); DATA(insert OID = 4209 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_to_tsvector _null_ _null_ _null_ )); DESCR("transform jsonb to tsvector"); DATA(insert OID = 4210 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "114" _null_ _null_ _null_ _null_ _null_ json_to_tsvector _null_ _null_ _null_ )); diff --git 
a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index f8ddce5ecb..73e969fe9c 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -25,9 +25,11 @@ struct TSVectorParseStateData; /* opaque struct in tsvector_parser.c */ typedef struct TSVectorParseStateData *TSVectorParseState; -extern TSVectorParseState init_tsvector_parser(char *input, - bool oprisdelim, - bool is_tsquery); +#define P_TSV_OPR_IS_DELIM (1 << 0) +#define P_TSV_IS_TSQUERY (1 << 1) +#define P_TSV_IS_WEB (1 << 2) + +extern TSVectorParseState init_tsvector_parser(char *input, int flags); extern void reset_tsvector_parser(TSVectorParseState state, char *input); extern bool gettoken_tsvector(TSVectorParseState state, char **token, int *len, @@ -35,6 +37,16 @@ extern bool gettoken_tsvector(TSVectorParseState state, char **endptr); extern void close_tsvector_parser(TSVectorParseState state); +/* phrase operator begins with '<' */ +#define ISOPERATOR(x) \ + ( pg_mblen(x) == 1 && ( *(x) == '!' 
|| \ + *(x) == '&' || \ + *(x) == '|' || \ + *(x) == '(' || \ + *(x) == ')' || \ + *(x) == '<' \ + ) ) + /* parse_tsquery */ struct TSQueryParserStateData; /* private in backend/utils/adt/tsquery.c */ @@ -46,9 +58,13 @@ typedef void (*PushFunction) (Datum opaque, TSQueryParserState state, * QueryOperand struct */ bool prefix); +#define P_TSQ_PLAIN (1 << 0) +#define P_TSQ_WEB (1 << 1) + extern TSQuery parse_tsquery(char *buf, - PushFunction pushval, - Datum opaque, bool isplain); + PushFunction pushval, + Datum opaque, + int flags); /* Functions for use by PushFunction implementations */ extern void pushValue(TSQueryParserState state, diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index d63fb12f1d..37825a1130 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -1672,3 +1672,432 @@ select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat ca (1 row) set enable_seqscan = on; +-- test websearch_to_tsquery function +select websearch_to_tsquery('simple', 'I have a fat:*ABCD cat'); + websearch_to_tsquery +--------------------------------------------- + 'i' & 'have' & 'a' & 'fat' & 'abcd' & 'cat' +(1 row) + +select websearch_to_tsquery('simple', 'orange:**AABBCCDD'); + websearch_to_tsquery +----------------------- + 'orange' & 'aabbccdd' +(1 row) + +select websearch_to_tsquery('simple', 'fat:A!cat:B|rat:C<'); + websearch_to_tsquery +----------------------------------------- + 'fat' & 'a' & 'cat' & 'b' & 'rat' & 'c' +(1 row) + +select websearch_to_tsquery('simple', 'fat:A : cat:B'); + websearch_to_tsquery +--------------------------- + 'fat' & 'a' & 'cat' & 'b' +(1 row) + +select websearch_to_tsquery('simple', 'fat*rat'); + websearch_to_tsquery +---------------------- + 'fat' & 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat-rat'); + websearch_to_tsquery +--------------------------- + 'fat-rat' & 'fat' & 'rat' +(1 row) + +select 
websearch_to_tsquery('simple', 'fat_rat'); + websearch_to_tsquery +---------------------- + 'fat' & 'rat' +(1 row) + +-- weights are completely ignored +select websearch_to_tsquery('simple', 'abc : def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'abc:def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'a:::b'); + websearch_to_tsquery +---------------------- + 'a' & 'b' +(1 row) + +select websearch_to_tsquery('simple', 'abc:d'); + websearch_to_tsquery +---------------------- + 'abc' & 'd' +(1 row) + +select websearch_to_tsquery('simple', ':'); +NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored + websearch_to_tsquery +---------------------- + +(1 row) + +-- these operators are ignored +select websearch_to_tsquery('simple', 'abc & def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'abc | def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'abc <-> def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('simple', 'abc (pg or class)'); + websearch_to_tsquery +------------------------ + 'abc' & 'pg' | 'class' +(1 row) + +-- NOT is ignored in quotes +select websearch_to_tsquery('english', 'My brand new smartphone'); + websearch_to_tsquery +------------------------------- + 'brand' & 'new' & 'smartphon' +(1 row) + +select websearch_to_tsquery('english', 'My brand "new smartphone"'); + websearch_to_tsquery +--------------------------------- + 'brand' & 'new' <-> 'smartphon' +(1 row) + +select websearch_to_tsquery('english', 'My brand "new -smartphone"'); + websearch_to_tsquery +--------------------------------- + 'brand' & 'new' <-> 'smartphon' +(1 row) + +-- test OR operator +select 
websearch_to_tsquery('simple', 'cat or rat'); + websearch_to_tsquery +---------------------- + 'cat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'cat OR rat'); + websearch_to_tsquery +---------------------- + 'cat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'cat "OR" rat'); + websearch_to_tsquery +---------------------- + 'cat' & 'or' & 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'cat OR'); + websearch_to_tsquery +---------------------- + 'cat' & 'or' +(1 row) + +select websearch_to_tsquery('simple', 'OR rat'); + websearch_to_tsquery +---------------------- + 'or' & 'rat' +(1 row) + +select websearch_to_tsquery('simple', '"fat cat OR rat"'); + websearch_to_tsquery +------------------------------------ + 'fat' <-> 'cat' <-> 'or' <-> 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat (cat OR rat'); + websearch_to_tsquery +----------------------- + 'fat' & 'cat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'or OR or'); + websearch_to_tsquery +---------------------- + 'or' | 'or' +(1 row) + +-- OR is an operator here ... 
+select websearch_to_tsquery('simple', '"fat cat"or"fat rat"'); + websearch_to_tsquery +----------------------------------- + 'fat' <-> 'cat' | 'fat' <-> 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or(rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or)rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or&rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or|rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or!rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or<rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or>rat'); + websearch_to_tsquery +---------------------- + 'fat' | 'rat' +(1 row) + +select websearch_to_tsquery('simple', 'fat or '); + websearch_to_tsquery +---------------------- + 'fat' +(1 row) + +-- ... 
but not here +select websearch_to_tsquery('simple', 'abc orange'); + websearch_to_tsquery +---------------------- + 'abc' & 'orange' +(1 row) + +select websearch_to_tsquery('simple', 'abc orтест'); + websearch_to_tsquery +---------------------- + 'abc' & 'orтест' +(1 row) + +select websearch_to_tsquery('simple', 'abc OR1234'); + websearch_to_tsquery +---------------------- + 'abc' & 'or1234' +(1 row) + +select websearch_to_tsquery('simple', 'abc or-abc'); + websearch_to_tsquery +--------------------------------- + 'abc' & 'or-abc' & 'or' & 'abc' +(1 row) + +select websearch_to_tsquery('simple', 'abc OR_abc'); + websearch_to_tsquery +---------------------- + 'abc' & 'or' & 'abc' +(1 row) + +select websearch_to_tsquery('simple', 'abc or'); + websearch_to_tsquery +---------------------- + 'abc' & 'or' +(1 row) + +-- test quotes +select websearch_to_tsquery('english', '"pg_class pg'); + websearch_to_tsquery +----------------------- + 'pg' & 'class' & 'pg' +(1 row) + +select websearch_to_tsquery('english', 'pg_class pg"'); + websearch_to_tsquery +----------------------- + 'pg' & 'class' & 'pg' +(1 row) + +select websearch_to_tsquery('english', '"pg_class pg"'); + websearch_to_tsquery +----------------------------- + ( 'pg' & 'class' ) <-> 'pg' +(1 row) + +select websearch_to_tsquery('english', 'abc "pg_class pg"'); + websearch_to_tsquery +------------------------------------- + 'abc' & ( 'pg' & 'class' ) <-> 'pg' +(1 row) + +select websearch_to_tsquery('english', '"pg_class pg" def'); + websearch_to_tsquery +------------------------------------- + ( 'pg' & 'class' ) <-> 'pg' & 'def' +(1 row) + +select websearch_to_tsquery('english', 'abc "pg pg_class pg" def'); + websearch_to_tsquery +------------------------------------------------------ + 'abc' & 'pg' <-> ( 'pg' & 'class' ) <-> 'pg' & 'def' +(1 row) + +select websearch_to_tsquery('english', ' or "pg pg_class pg" or '); + websearch_to_tsquery +-------------------------------------- + 'pg' <-> ( 'pg' & 'class' ) <-> 
'pg' +(1 row) + +select websearch_to_tsquery('english', '""pg pg_class pg""'); + websearch_to_tsquery +------------------------------ + 'pg' & 'pg' & 'class' & 'pg' +(1 row) + +select websearch_to_tsquery('english', 'abc """"" def'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('english', 'cat -"fat rat"'); + websearch_to_tsquery +------------------------------ + 'cat' & !( 'fat' <-> 'rat' ) +(1 row) + +select websearch_to_tsquery('english', 'cat -"fat rat" cheese'); + websearch_to_tsquery +---------------------------------------- + 'cat' & !( 'fat' <-> 'rat' ) & 'chees' +(1 row) + +select websearch_to_tsquery('english', 'abc "def -"'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('english', 'abc "def :"'); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('english', '"A fat cat" has just eaten a -rat.'); + websearch_to_tsquery +------------------------------------ + 'fat' <-> 'cat' & 'eaten' & !'rat' +(1 row) + +select websearch_to_tsquery('english', '"A fat cat" has just eaten OR !rat.'); + websearch_to_tsquery +----------------------------------- + 'fat' <-> 'cat' & 'eaten' | 'rat' +(1 row) + +select websearch_to_tsquery('english', '"A fat cat" has just (+eaten OR -rat)'); + websearch_to_tsquery +------------------------------------ + 'fat' <-> 'cat' & 'eaten' | !'rat' +(1 row) + +select websearch_to_tsquery('english', 'this is ----fine'); + websearch_to_tsquery +---------------------- + !!!!'fine' +(1 row) + +select websearch_to_tsquery('english', '(()) )))) this ||| is && -fine, "dear friend" OR good'); + websearch_to_tsquery +---------------------------------------- + !'fine' & 'dear' <-> 'friend' | 'good' +(1 row) + +select websearch_to_tsquery('english', 'an old <-> cat " is fine &&& too'); + websearch_to_tsquery +------------------------ + 'old' & 'cat' & 'fine' +(1 row) + +select 
websearch_to_tsquery('english', '"A the" OR just on'); +NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored + websearch_to_tsquery +---------------------- + +(1 row) + +select websearch_to_tsquery('russian', '"толстая кошка" съела крысу'); + websearch_to_tsquery +-------------------------------------- + 'толст' <-> 'кошк' & 'съел' & 'крыс' +(1 row) + +select to_tsvector('russian', 'съела толстая кошка крысу') @@ +websearch_to_tsquery('russian', '"толстая кошка" съела крысу'); + ?column? +---------- + t +(1 row) + +select to_tsvector('russian', 'съела толстая серая кошка крысу') @@ +websearch_to_tsquery('russian', '"толстая кошка" съела крысу'); + ?column? +---------- + f +(1 row) + +-- cases handled by gettoken_tsvector() +select websearch_to_tsquery(''''); +NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored + websearch_to_tsquery +---------------------- + +(1 row) + +select websearch_to_tsquery('''abc''''def'''); + websearch_to_tsquery +---------------------- + 'abc' & 'def' +(1 row) + +select websearch_to_tsquery('\abc'); + websearch_to_tsquery +---------------------- + 'abc' +(1 row) + +select websearch_to_tsquery('\'); +NOTICE: text-search query contains only stop words or doesn't contain lexemes, ignored + websearch_to_tsquery +---------------------- + +(1 row) + diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index 1c8520b3e9..f299a5d32b 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -539,3 +539,98 @@ create index phrase_index_test_idx on phrase_index_test using gin(fts); set enable_seqscan = off; select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat cat'); set enable_seqscan = on; + +-- test websearch_to_tsquery function +select websearch_to_tsquery('simple', 'I have a fat:*ABCD cat'); +select websearch_to_tsquery('simple', 'orange:**AABBCCDD'); +select 
websearch_to_tsquery('simple', 'fat:A!cat:B|rat:C<'); +select websearch_to_tsquery('simple', 'fat:A : cat:B'); + +select websearch_to_tsquery('simple', 'fat*rat'); +select websearch_to_tsquery('simple', 'fat-rat'); +select websearch_to_tsquery('simple', 'fat_rat'); + +-- weights are completely ignored +select websearch_to_tsquery('simple', 'abc : def'); +select websearch_to_tsquery('simple', 'abc:def'); +select websearch_to_tsquery('simple', 'a:::b'); +select websearch_to_tsquery('simple', 'abc:d'); +select websearch_to_tsquery('simple', ':'); + +-- these operators are ignored +select websearch_to_tsquery('simple', 'abc & def'); +select websearch_to_tsquery('simple', 'abc | def'); +select websearch_to_tsquery('simple', 'abc <-> def'); +select websearch_to_tsquery('simple', 'abc (pg or class)'); + +-- NOT is ignored in quotes +select websearch_to_tsquery('english', 'My brand new smartphone'); +select websearch_to_tsquery('english', 'My brand "new smartphone"'); +select websearch_to_tsquery('english', 'My brand "new -smartphone"'); + +-- test OR operator +select websearch_to_tsquery('simple', 'cat or rat'); +select websearch_to_tsquery('simple', 'cat OR rat'); +select websearch_to_tsquery('simple', 'cat "OR" rat'); +select websearch_to_tsquery('simple', 'cat OR'); +select websearch_to_tsquery('simple', 'OR rat'); +select websearch_to_tsquery('simple', '"fat cat OR rat"'); +select websearch_to_tsquery('simple', 'fat (cat OR rat'); +select websearch_to_tsquery('simple', 'or OR or'); + +-- OR is an operator here ... 
+select websearch_to_tsquery('simple', '"fat cat"or"fat rat"'); +select websearch_to_tsquery('simple', 'fat or(rat'); +select websearch_to_tsquery('simple', 'fat or)rat'); +select websearch_to_tsquery('simple', 'fat or&rat'); +select websearch_to_tsquery('simple', 'fat or|rat'); +select websearch_to_tsquery('simple', 'fat or!rat'); +select websearch_to_tsquery('simple', 'fat or<rat'); +select websearch_to_tsquery('simple', 'fat or>rat'); +select websearch_to_tsquery('simple', 'fat or '); + +-- ... but not here +select websearch_to_tsquery('simple', 'abc orange'); +select websearch_to_tsquery('simple', 'abc orтест'); +select websearch_to_tsquery('simple', 'abc OR1234'); +select websearch_to_tsquery('simple', 'abc or-abc'); +select websearch_to_tsquery('simple', 'abc OR_abc'); +select websearch_to_tsquery('simple', 'abc or'); + +-- test quotes +select websearch_to_tsquery('english', '"pg_class pg'); +select websearch_to_tsquery('english', 'pg_class pg"'); +select websearch_to_tsquery('english', '"pg_class pg"'); +select websearch_to_tsquery('english', 'abc "pg_class pg"'); +select websearch_to_tsquery('english', '"pg_class pg" def'); +select websearch_to_tsquery('english', 'abc "pg pg_class pg" def'); +select websearch_to_tsquery('english', ' or "pg pg_class pg" or '); +select websearch_to_tsquery('english', '""pg pg_class pg""'); +select websearch_to_tsquery('english', 'abc """"" def'); +select websearch_to_tsquery('english', 'cat -"fat rat"'); +select websearch_to_tsquery('english', 'cat -"fat rat" cheese'); +select websearch_to_tsquery('english', 'abc "def -"'); +select websearch_to_tsquery('english', 'abc "def :"'); + +select websearch_to_tsquery('english', '"A fat cat" has just eaten a -rat.'); +select websearch_to_tsquery('english', '"A fat cat" has just eaten OR !rat.'); +select websearch_to_tsquery('english', '"A fat cat" has just (+eaten OR -rat)'); + +select websearch_to_tsquery('english', 'this is ----fine'); +select websearch_to_tsquery('english', '(()) 
)))) this ||| is && -fine, "dear friend" OR good'); +select websearch_to_tsquery('english', 'an old <-> cat " is fine &&& too'); + +select websearch_to_tsquery('english', '"A the" OR just on'); +select websearch_to_tsquery('russian', '"толстая кошка" съела крысу'); + +select to_tsvector('russian', 'съела толстая кошка крысу') @@ +websearch_to_tsquery('russian', '"толстая кошка" съела крысу'); + +select to_tsvector('russian', 'съела толстая серая кошка крысу') @@ +websearch_to_tsquery('russian', '"толстая кошка" съела крысу'); + +-- cases handled by gettoken_tsvector() +select websearch_to_tsquery(''''); +select websearch_to_tsquery('''abc''''def'''); +select websearch_to_tsquery('\abc'); +select websearch_to_tsquery('\');