Andres Freund <and...@anarazel.de> writes:

Hi Andres,

Thank you for the review of my patch.

> Some points:
> - Patch looks generally sound
> - lacks a bit of a motivational statement, even though one can imagine uses

The patch has initially been motivated by the request in pgsql-general
(http://archives.postgresql.org/pgsql-general/2009-02/msg00102.php).

> - Imho mode=MAP should error out if keeporig is false
> - I personally find the the names for the different modes a bit 
> nondescriptive.
>   One possibility would be to introduce parameters like:
>       - matchorig
>       - matchsynonym
>       - keeporig
>       - keepsynonym
> That sounds way much easier to grasp for me.

Yes, I agree. That way the user has complete (and more straightforward)
control over the dictionary's behaviour.

Here is the revised version of the patch, with the following options:

     * matchorig controls whether the original word is accepted by the
       dictionary. Default is true.

     * keeporig controls whether the original word is included (if true)
       in results, or only its synonyms (if false). Default is true.

     * matchsynonyms controls whether any of the synonyms is accepted by
       the dictionary (if true). Default is false.

     * keepsynonyms controls whether synonyms are returned by the
       dictionary (if true). Default is true.

The defaults are chosen so that the out-of-the-box behaviour stays compatible
with the original version.

Thanks,
Sergey

Index: contrib/dict_xsyn/dict_xsyn.c
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/dict_xsyn.c,v
retrieving revision 1.6
diff -u -r1.6 dict_xsyn.c
--- contrib/dict_xsyn/dict_xsyn.c	1 Jan 2009 17:23:32 -0000	1.6
+++ contrib/dict_xsyn/dict_xsyn.c	27 Jul 2009 09:51:52 -0000
@@ -26,6 +26,7 @@
 	char	   *key;			/* Word */
 	char	   *value;			/* Unparsed list of synonyms, including the
 								 * word itself */
+	int         pos;            /* Position of key word in original string */
 } Syn;
 
 typedef struct
@@ -33,7 +34,11 @@
 	int			len;
 	Syn		   *syn;
 
+	bool		matchorig;
 	bool		keeporig;
+	bool		matchsynonyms;
+	bool		keepsynonyms;
+	
 } DictSyn;
 
 
@@ -88,6 +93,7 @@
 	{
 		char	   *value;
 		char	   *key;
+		char       *pos;
 		char	   *end = NULL;
 
 		if (*line == '\0')
@@ -96,26 +102,39 @@
 		value = lowerstr(line);
 		pfree(line);
 
-		key = find_word(value, &end);
-		if (!key)
-		{
-			pfree(value);
-			continue;
-		}
+		pos = value;
 
-		if (cur == d->len)
+		while((key = find_word(pos, &end)) != NULL)
 		{
-			d->len = (d->len > 0) ? 2 * d->len : 16;
-			if (d->syn)
-				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
-			else
-				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
-		}
+			if (cur == d->len)
+			{
+				d->len = (d->len > 0) ? 2 * d->len : 16;
+				if (d->syn)
+					d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
+				else
+					d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
+			}
 
-		d->syn[cur].key = pnstrdup(key, end - key);
-		d->syn[cur].value = value;
+			/* Read first word only if we will match it */
+			if (pos != value || d->matchorig)
+			{
+				d->syn[cur].key = pnstrdup(key, end - key);
+				d->syn[cur].value = pstrdup(value);
+				d->syn[cur].pos = key - value;
+			
+				cur++;
+			}
 
-		cur++;
+			pos = end;
+
+			/* Don't read synonyms if we do not match them */
+			if (!d->matchsynonyms)
+			{
+				break;
+			}
+		}
+
+		pfree(value);
 	}
 
 	tsearch_readline_end(&trst);
@@ -133,23 +152,40 @@
 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
 	DictSyn    *d;
 	ListCell   *l;
-
+	char       *filename = NULL;
+	
 	d = (DictSyn *) palloc0(sizeof(DictSyn));
 	d->len = 0;
 	d->syn = NULL;
+	d->matchorig = true;
 	d->keeporig = true;
-
+	d->matchsynonyms = false;
+	d->keepsynonyms = true;
+	
 	foreach(l, dictoptions)
 	{
 		DefElem    *defel = (DefElem *) lfirst(l);
 
-		if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
+		if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0)
+		{
+			d->matchorig = defGetBoolean(defel);
+		}
+		else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
 		{
 			d->keeporig = defGetBoolean(defel);
 		}
+		else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0)
+		{
+			d->matchsynonyms = defGetBoolean(defel);
+		}
+		else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0)
+		{
+			d->keepsynonyms = defGetBoolean(defel);
+		}
 		else if (pg_strcasecmp(defel->defname, "RULES") == 0)
 		{
-			read_dictionary(d, defGetString(defel));
+			/* we can't read the rules before parsing all options! */
+			filename = pstrdup(defGetString(defel));
 		}
 		else
 		{
@@ -160,6 +196,12 @@
 		}
 	}
 
+	if(filename)
+	{
+		read_dictionary(d, filename);
+		pfree(filename);
+	}
+	
 	PG_RETURN_POINTER(d);
 }
 
@@ -198,7 +240,6 @@
 		int			value_length = strlen(value);
 		char	   *pos = value;
 		int			nsyns = 0;
-		bool		is_first = true;
 
 		res = palloc(0);
 
@@ -214,8 +255,8 @@
 			res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
 			res[nsyns].lexeme = NULL;
 
-			/* first word is added to result only if KEEPORIG flag is set */
-			if (d->keeporig || !is_first)
+			/* The first word is added only if keeporig=true */
+			if (pos != value || d->keeporig)
 			{
 				res[nsyns].lexeme = pstrdup(syn);
 				res[nsyns + 1].lexeme = NULL;
@@ -223,9 +264,12 @@
 				nsyns++;
 			}
 
-			is_first = false;
-
 			pos = end + 1;
+
+			if(!d->keepsynonyms)
+			{
+				break;
+			}
 		}
 
 		pfree(value);
Index: contrib/dict_xsyn/expected/dict_xsyn.out
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/expected/dict_xsyn.out,v
retrieving revision 1.1
diff -u -r1.1 dict_xsyn.out
--- contrib/dict_xsyn/expected/dict_xsyn.out	15 Oct 2007 21:36:50 -0000	1.1
+++ contrib/dict_xsyn/expected/dict_xsyn.out	27 Jul 2009 09:51:53 -0000
@@ -5,10 +5,76 @@
 SET client_min_messages = warning;
 \set ECHO none
 RESET client_min_messages;
---configuration
-ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+-- default configuration - match first word and return it among with all synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
 --lexize
 SELECT ts_lexize('xsyn', 'supernova');
+        ts_lexize         
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+-- the same, but return only synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+   ts_lexize    
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+-- match any word and return all words
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+        ts_lexize         
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+        ts_lexize         
+--------------------------
+ {supernova,sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+-- match any word and return all words except first one
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+   ts_lexize    
+----------------
+ {sn,sne,1987a}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
    ts_lexize    
 ----------------
  {sn,sne,1987a}
@@ -20,3 +86,63 @@
  
 (1 row)
 
+-- match any synonym but not first word, and return first word instead
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+  ts_lexize  
+-------------
+ {supernova}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+-- do not match or return anything
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
+-- match any word but return nothing
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+ ts_lexize 
+-----------
+ {}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'sn');
+ ts_lexize 
+-----------
+ {}
+(1 row)
+
+SELECT ts_lexize('xsyn', 'grb');
+ ts_lexize 
+-----------
+ 
+(1 row)
+
Index: contrib/dict_xsyn/sql/dict_xsyn.sql
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/sql/dict_xsyn.sql,v
retrieving revision 1.1
diff -u -r1.1 dict_xsyn.sql
--- contrib/dict_xsyn/sql/dict_xsyn.sql	15 Oct 2007 21:36:50 -0000	1.1
+++ contrib/dict_xsyn/sql/dict_xsyn.sql	27 Jul 2009 09:51:53 -0000
@@ -8,9 +8,47 @@
 \set ECHO all
 RESET client_min_messages;
 
---configuration
-ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+-- default configuration - match first word and return it among with all synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
 
 --lexize
 SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
 SELECT ts_lexize('xsyn', 'grb');
+
+-- the same, but return only synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word and return all words
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word and return all words except first one
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any synonym but not first word, and return first word instead
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- do not match or return anything
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
+-- match any word but return nothing
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
+SELECT ts_lexize('xsyn', 'supernova');
+SELECT ts_lexize('xsyn', 'sn');
+SELECT ts_lexize('xsyn', 'grb');
+
Index: doc/src/sgml/dict-xsyn.sgml
===================================================================
RCS file: /projects/cvsroot/pgsql/doc/src/sgml/dict-xsyn.sgml,v
retrieving revision 1.2
diff -u -r1.2 dict-xsyn.sgml
--- doc/src/sgml/dict-xsyn.sgml	6 Dec 2007 04:12:10 -0000	1.2
+++ doc/src/sgml/dict-xsyn.sgml	27 Jul 2009 09:51:56 -0000
@@ -23,13 +23,32 @@
   <itemizedlist>
    <listitem>
     <para>
+     <literal>matchorig</> controls whether the original word is accepted by
+     the dictionary. Default is <literal>true</>.
+    </para>
+   </listitem>
+   <listitem>
+    <para>
      <literal>keeporig</> controls whether the original word is included (if
-     <literal>true</>), or only its synonyms (if <literal>false</>). Default
-     is <literal>true</>.
+     <literal>true</>) in results, or only its synonyms (if
+     <literal>false</>). Default is <literal>true</>.
+    </para>
+   </listitem>
+   <listitem>
+    <para>
+     <literal>matchsynonyms</> controls whether any of the synonyms is accepted
+     by the dictionary (if <literal>true</>). Default is <literal>false</>.
     </para>
    </listitem>
    <listitem>
     <para>
+     <literal>keepsynonyms</> controls whether synonyms are returned by the
+     dictionary (if <literal>true</>). Default is <literal>true</>.
+    </para>
+   </listitem>
+
+   <listitem>
+    <para>
      <literal>rules</> is the base name of the file containing the list of
      synonyms.  This file must be stored in
      <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
@@ -90,7 +109,31 @@
 mydb=# SELECT ts_lexize('xsyn', 'word');
       ts_lexize
 -----------------------
+ {syn1,syn2,syn3}
+
+mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'word');
+      ts_lexize
+-----------------------
  {word,syn1,syn2,syn3}
+
+mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=false, MATCHSYNONYMS=true);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'syn1');
+      ts_lexize
+-----------------------
+ {syn1,syn2,syn3}
+
+mydb# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false);
+ALTER TEXT SEARCH DICTIONARY
+
+mydb=# SELECT ts_lexize('xsyn', 'syn1');
+      ts_lexize
+-----------------------
+ {word}
 </programlisting>
 
    but real-world usage will involve including it in a text search
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to