Andres Freund <and...@anarazel.de> writes: Hi Andres,
Thank you for the review of my patch. > Some points: > - Patch looks generally sound > - lacks a bit of a motivational statement, even though one can imagine uses The patch was originally motivated by a request on pgsql-general (http://archives.postgresql.org/pgsql-general/2009-02/msg00102.php). > - Imho mode=MAP should error out if keeporig is false > - I personally find the names for the different modes a bit > nondescriptive. > One possibility would be to introduce parameters like: > - matchorig > - matchsynonym > - keeporig > - keepsynonym > That sounds way much easier to grasp for me. Yes, I agree. This way the user has complete (and more straightforward) control over the dictionary's behaviour. Here is the revised version of the patch, with the following options: * matchorig controls whether the original word is accepted by the dictionary. Default is true. * keeporig controls whether the original word is included (if true) in results, or only its synonyms (if false). Default is true. * matchsynonyms controls whether any of the synonyms is accepted by the dictionary (if true). Default is false. * keepsynonyms controls whether synonyms are returned by the dictionary (if true). Default is true. The defaults are chosen so that the default behaviour stays compatible with the original version. Thanks, Sergey
Index: contrib/dict_xsyn/dict_xsyn.c =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/dict_xsyn.c,v retrieving revision 1.6 diff -u -r1.6 dict_xsyn.c --- contrib/dict_xsyn/dict_xsyn.c 1 Jan 2009 17:23:32 -0000 1.6 +++ contrib/dict_xsyn/dict_xsyn.c 27 Jul 2009 09:51:52 -0000 @@ -26,6 +26,7 @@ char *key; /* Word */ char *value; /* Unparsed list of synonyms, including the * word itself */ + int pos; /* Position of key word in original string */ } Syn; typedef struct @@ -33,7 +34,11 @@ int len; Syn *syn; + bool matchorig; bool keeporig; + bool matchsynonyms; + bool keepsynonyms; + } DictSyn; @@ -88,6 +93,7 @@ { char *value; char *key; + char *pos; char *end = NULL; if (*line == '\0') @@ -96,26 +102,39 @@ value = lowerstr(line); pfree(line); - key = find_word(value, &end); - if (!key) - { - pfree(value); - continue; - } + pos = value; - if (cur == d->len) + while((key = find_word(pos, &end)) != NULL) { - d->len = (d->len > 0) ? 2 * d->len : 16; - if (d->syn) - d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); - else - d->syn = (Syn *) palloc(sizeof(Syn) * d->len); - } + if (cur == d->len) + { + d->len = (d->len > 0) ? 
2 * d->len : 16; + if (d->syn) + d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); + else + d->syn = (Syn *) palloc(sizeof(Syn) * d->len); + } - d->syn[cur].key = pnstrdup(key, end - key); - d->syn[cur].value = value; + /* Read first word only if we will match it */ + if (pos != value || d->matchorig) + { + d->syn[cur].key = pnstrdup(key, end - key); + d->syn[cur].value = pstrdup(value); + d->syn[cur].pos = key - value; + + cur++; + } - cur++; + pos = end; + + /* Don't read synonyms if we do not match them */ + if (!d->matchsynonyms) + { + break; + } + } + + pfree(value); } tsearch_readline_end(&trst); @@ -133,23 +152,40 @@ List *dictoptions = (List *) PG_GETARG_POINTER(0); DictSyn *d; ListCell *l; - + char *filename = NULL; + d = (DictSyn *) palloc0(sizeof(DictSyn)); d->len = 0; d->syn = NULL; + d->matchorig = true; d->keeporig = true; - + d->matchsynonyms = false; + d->keepsynonyms = true; + foreach(l, dictoptions) { DefElem *defel = (DefElem *) lfirst(l); - if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0) + if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0) + { + d->matchorig = defGetBoolean(defel); + } + else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0) { d->keeporig = defGetBoolean(defel); } + else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0) + { + d->matchsynonyms = defGetBoolean(defel); + } + else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0) + { + d->keepsynonyms = defGetBoolean(defel); + } else if (pg_strcasecmp(defel->defname, "RULES") == 0) { - read_dictionary(d, defGetString(defel)); + /* we can't read the rules before parsing all options! 
*/ + filename = pstrdup(defGetString(defel)); } else { @@ -160,6 +196,12 @@ } } + if(filename) + { + read_dictionary(d, filename); + pfree(filename); + } + PG_RETURN_POINTER(d); } @@ -198,7 +240,6 @@ int value_length = strlen(value); char *pos = value; int nsyns = 0; - bool is_first = true; res = palloc(0); @@ -214,8 +255,8 @@ res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2)); res[nsyns].lexeme = NULL; - /* first word is added to result only if KEEPORIG flag is set */ - if (d->keeporig || !is_first) + /* The first word is added only if keeporig=true */ + if (pos != value || d->keeporig) { res[nsyns].lexeme = pstrdup(syn); res[nsyns + 1].lexeme = NULL; @@ -223,9 +264,12 @@ nsyns++; } - is_first = false; - pos = end + 1; + + if(!d->keepsynonyms) + { + break; + } } pfree(value); Index: contrib/dict_xsyn/expected/dict_xsyn.out =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/expected/dict_xsyn.out,v retrieving revision 1.1 diff -u -r1.1 dict_xsyn.out --- contrib/dict_xsyn/expected/dict_xsyn.out 15 Oct 2007 21:36:50 -0000 1.1 +++ contrib/dict_xsyn/expected/dict_xsyn.out 27 Jul 2009 09:51:53 -0000 @@ -5,10 +5,76 @@ SET client_min_messages = warning; \set ECHO none RESET client_min_messages; ---configuration -ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); +-- default configuration - match first word and return it among with all synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); --lexize SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- the same, but return only synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, 
MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word and return all words +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word and return all words except first one +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); ts_lexize ---------------- {sn,sne,1987a} @@ -20,3 +86,63 @@ (1 row) +-- match any synonym but not first word, and return first word instead +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +------------- + {supernova} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- do not match or return anything +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word but return nothing +ALTER TEXT 
SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + {} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + {} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + Index: contrib/dict_xsyn/sql/dict_xsyn.sql =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/sql/dict_xsyn.sql,v retrieving revision 1.1 diff -u -r1.1 dict_xsyn.sql --- contrib/dict_xsyn/sql/dict_xsyn.sql 15 Oct 2007 21:36:50 -0000 1.1 +++ contrib/dict_xsyn/sql/dict_xsyn.sql 27 Jul 2009 09:51:53 -0000 @@ -8,9 +8,47 @@ \set ECHO all RESET client_min_messages; ---configuration -ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); +-- default configuration - match first word and return it among with all synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); --lexize SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); SELECT ts_lexize('xsyn', 'grb'); + +-- the same, but return only synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word and return all words +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word and return all words except first one +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT 
ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any synonym but not first word, and return first word instead +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- do not match or return anything +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word but return nothing +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + Index: doc/src/sgml/dict-xsyn.sgml =================================================================== RCS file: /projects/cvsroot/pgsql/doc/src/sgml/dict-xsyn.sgml,v retrieving revision 1.2 diff -u -r1.2 dict-xsyn.sgml --- doc/src/sgml/dict-xsyn.sgml 6 Dec 2007 04:12:10 -0000 1.2 +++ doc/src/sgml/dict-xsyn.sgml 27 Jul 2009 09:51:56 -0000 @@ -23,13 +23,32 @@ <itemizedlist> <listitem> <para> + <literal>matchorig</> controls whether the original word is accepted by + the dictionary. Default is <literal>true</>. + </para> + </listitem> + <listitem> + <para> <literal>keeporig</> controls whether the original word is included (if - <literal>true</>), or only its synonyms (if <literal>false</>). Default - is <literal>true</>. + <literal>true</>) in results, or only its synonyms (if + <literal>false</>). Default is <literal>true</>. + </para> + </listitem> + <listitem> + <para> + <literal>matchsynonyms</> controls whether any of the synonyms is accepted + by the dictionary (if <literal>true</>). Default is <literal>false</>. 
</para> </listitem> <listitem> <para> + <literal>keepsynonyms</> controls whether synonyms are returned by the + dictionary (if <literal>true</>). Default is <literal>true</>. + </para> + </listitem> + + <listitem> + <para> <literal>rules</> is the base name of the file containing the list of synonyms. This file must be stored in <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means @@ -90,7 +109,31 @@ mydb=# SELECT ts_lexize('xsyn', 'word'); ts_lexize ----------------------- + {syn1,syn2,syn3} + +mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true); +ALTER TEXT SEARCH DICTIONARY + +mydb=# SELECT ts_lexize('xsyn', 'word'); + ts_lexize +----------------------- {word,syn1,syn2,syn3} + +mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=false, MATCHSYNONYMS=true); +ALTER TEXT SEARCH DICTIONARY + +mydb=# SELECT ts_lexize('xsyn', 'syn1'); + ts_lexize +----------------------- + {syn1,syn2,syn3} + +mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false); +ALTER TEXT SEARCH DICTIONARY + +mydb=# SELECT ts_lexize('xsyn', 'syn1'); + ts_lexize +----------------------- + {word} </programlisting> but real-world usage will involve including it in a text search
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers