Hackers,

Here is first version of patch, which enable index support of wildcard
search in pg_trgm contrib module. The idea of the patch is to extract from
wildcard trigrams which should occurs in wildcard matching string. For
example, for '%sector%' wildcard such trigrams would be: 'sec', 'ect',
'tor'.

create table words (word text);
copy words from '/usr/share/dict/american-english';

test=# explain analyze select * from words where word ilike '%independ%';
                                              QUERY PLAN

------------------------------------------------------------------------------------------------------
 Seq Scan on words  (cost=0.00..1703.11 rows=10 width=9) (actual
time=18.818..174.146 rows=7 loops=1)
   Filter: (word ~~* '%independ%'::text)
 Total runtime: 174.200 ms
(3 rows)

CREATE INDEX trgm_idx ON words USING gist (word gist_trgm_ops);

test=# explain analyze select * from words where word ilike '%independ%';
                                                    QUERY PLAN


------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on words  (cost=4.36..40.11 rows=10 width=9) (actual
time=2.445..2.529 rows=7 loops=1)
   Recheck Cond: (word ~~* '%independ%'::text)
   ->  Bitmap Index Scan on trgm_idx  (cost=0.00..4.35 rows=10 width=0)
(actual time=2.406..2.406 rows=7 loops=1)
         Index Cond: (word ~~* '%independ%'::text)
 Total runtime: 2.612 ms
(5 rows)

CREATE INDEX trgm_idx ON words USING gin (word gin_trgm_ops);

test=# explain analyze select * from words where word ilike '%independ%';
                                                    QUERY PLAN


-------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on words  (cost=76.08..111.83 rows=10 width=9) (actual
time=2.675..2.755 rows=7 loops=1)
   Recheck Cond: (word ~~* '%independ%'::text)
   ->  Bitmap Index Scan on trgm_idx  (cost=0.00..76.07 rows=10 width=0)
(actual time=2.642..2.642 rows=7 loops=1)
         Index Cond: (word ~~* '%independ%'::text)
 Total runtime: 2.839 ms
(5 rows)

I've encountered with following problems:
1) Indexing support for ilike is possible only with case-insensetive
wildcards, e.g. when IGNORECASE macro is enabled. But I can't use this macro
in pg_trgm.sql.in, where list of operators is defined. Probably, is it
enuogh to put comment near IGNORECASE, which tells that if one disable this
macro he should also remove oparators from pg_trgm.sql.in?
2) I found gist index not very useful with default SIGLENINT = 3. I've
changed this value to 15 and I found gist index performs very good on
dictionary. But on longer strings greater values of SIGLENINT may be
required (probably even SIGLENINT > 122 will give benefit in some cases in
spite of TOAST).

----
With best regards,
Alexander Korotkov.
*** a/contrib/pg_trgm/pg_trgm.sql.in
--- b/contrib/pg_trgm/pg_trgm.sql.in
***************
*** 113,118 **** FOR TYPE text USING gist
--- 113,120 ----
  AS
          OPERATOR        1       % (text, text),
          OPERATOR        2       <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
+         OPERATOR        3       ~~ (text, text),
+         OPERATOR        4       ~~* (text, text),
          FUNCTION        1       gtrgm_consistent (internal, text, int, oid, internal),
          FUNCTION        2       gtrgm_union (bytea, internal),
          FUNCTION        3       gtrgm_compress (internal),
***************
*** 144,149 **** CREATE OPERATOR CLASS gin_trgm_ops
--- 146,153 ----
  FOR TYPE text USING gin
  AS
          OPERATOR        1       % (text, text),
+         OPERATOR        3       ~~ (text, text),
+         OPERATOR        4       ~~* (text, text),
          FUNCTION        1       btint4cmp (int4, int4),
          FUNCTION        2       gin_extract_trgm (text, internal),
          FUNCTION        3       gin_extract_trgm (text, internal, int2, internal, internal),
*** a/contrib/pg_trgm/trgm.h
--- b/contrib/pg_trgm/trgm.h
***************
*** 19,24 ****
--- 19,26 ----
  /* operator strategy numbers */
  #define	SimilarityStrategyNumber	1
  #define	DistanceStrategyNumber		2
+ #define LikeStrategyNumber			3
+ #define ILikeStrategyNumber			4
  
  
  typedef char trgm[3];
***************
*** 53,59 **** typedef struct
  
  /* gist */
  #define BITBYTE 8
! #define SIGLENINT  3			/* >122 => key will toast, so very slow!!! */
  #define SIGLEN	( sizeof(int)*SIGLENINT )
  
  #define SIGLENBIT (SIGLEN*BITBYTE - 1)	/* see makesign */
--- 55,61 ----
  
  /* gist */
  #define BITBYTE 8
! #define SIGLENINT  15			/* >122 => key will toast, so very slow!!! */
  #define SIGLEN	( sizeof(int)*SIGLENINT )
  
  #define SIGLENBIT (SIGLEN*BITBYTE - 1)	/* see makesign */
***************
*** 89,94 **** typedef char *BITVECP;
--- 91,101 ----
  extern float4 trgm_limit;
  
  TRGM	   *generate_trgm(char *str, int slen);
+ TRGM	   *generate_wildcard_trgm(char *str, int slen);
  float4		cnt_sml(TRGM *trg1, TRGM *trg2);
+ bool            trgm_contain(TRGM *trg1, TRGM *trg2);
+ 
+ #define ISESCAPECHAR(x) (*(x) == '\\')
+ #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%')
  
  #endif /* __TRGM_H__ */
*** a/contrib/pg_trgm/trgm_gin.c
--- b/contrib/pg_trgm/trgm_gin.c
***************
*** 6,11 ****
--- 6,12 ----
  #include "trgm.h"
  
  #include "access/gin.h"
+ #include "access/skey.h"
  #include "access/itup.h"
  #include "access/tuptoaster.h"
  #include "storage/bufpage.h"
***************
*** 24,36 **** gin_extract_trgm(PG_FUNCTION_ARGS)
  {
  	text	   *val = (text *) PG_GETARG_TEXT_P(0);
  	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
  	Datum	   *entries = NULL;
  	TRGM	   *trg;
  	int4		trglen;
  
  	*nentries = 0;
  
! 	trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
  	trglen = ARRNELEM(trg);
  
  	if (trglen > 0)
--- 25,56 ----
  {
  	text	   *val = (text *) PG_GETARG_TEXT_P(0);
  	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
+ 	StrategyNumber strategy;
  	Datum	   *entries = NULL;
  	TRGM	   *trg;
  	int4		trglen;
  
+ 	if (fcinfo->nargs == 2)
+ 		strategy = SimilarityStrategyNumber;
+ 	else
+ 		strategy = PG_GETARG_UINT16(2);
+ 
  	*nentries = 0;
  
! 	switch (strategy)
! 	{
! 		case SimilarityStrategyNumber:
! 			trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
! 			break;
! 		case LikeStrategyNumber:
! 		case ILikeStrategyNumber:
! 			trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
! 			break;
! 		default:
! 			elog(ERROR, "unrecognized strategy number: %d", strategy);
! 			trg = NULL;		/* keep compiler quiet */
! 			break;
! 	}
  	trglen = ARRNELEM(trg);
  
  	if (trglen > 0)
***************
*** 71,77 **** gin_trgm_consistent(PG_FUNCTION_ARGS)
  {
  	bool	   *check = (bool *) PG_GETARG_POINTER(0);
  
! 	/* StrategyNumber strategy = PG_GETARG_UINT16(1); */
  	/* text    *query = PG_GETARG_TEXT_P(2); */
  	/* int32	nkeys = PG_GETARG_INT32(3); */
  	Pointer    *extra_data = (Pointer *) PG_GETARG_POINTER(4);
--- 91,97 ----
  {
  	bool	   *check = (bool *) PG_GETARG_POINTER(0);
  
! 	StrategyNumber strategy = PG_GETARG_UINT16(1);
  	/* text    *query = PG_GETARG_TEXT_P(2); */
  	/* int32	nkeys = PG_GETARG_INT32(3); */
  	Pointer    *extra_data = (Pointer *) PG_GETARG_POINTER(4);
***************
*** 81,100 **** gin_trgm_consistent(PG_FUNCTION_ARGS)
  				trglen,
  				ntrue = 0;
  
  	/* All cases served by this function are inexact */
  	*recheck = true;
  
  	trglen = *(int32 *) extra_data;
  
! 	for (i = 0; i < trglen; i++)
! 		if (check[i])
! 			ntrue++;
  
  #ifdef DIVUNION
! 	res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false);
  #else
! 	res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false);
  #endif
! 
  	PG_RETURN_BOOL(res);
  }
--- 101,144 ----
  				trglen,
  				ntrue = 0;
  
+ #ifndef IGNORECASE
+ 	if (strategy == ILIKE_STRATEGY)
+ 	{
+ 		elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
+ 	}
+ #endif
  	/* All cases served by this function are inexact */
  	*recheck = true;
  
  	trglen = *(int32 *) extra_data;
  
! 	switch (strategy)
! 	{
! 		case SimilarityStrategyNumber:
! 			for (i = 0; i < trglen; i++)
! 				if (check[i])
! 					ntrue++;
  
  #ifdef DIVUNION
! 			res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false);
  #else
! 			res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false);
  #endif
! 			break;
! 		case LikeStrategyNumber:
! 		case ILikeStrategyNumber:
! 			res = true;
! 			for (i = 0; i < trglen; i++)
! 				if (!check[i])
! 				{
! 					res = false;
! 					break;
! 				}
! 			break;
! 		default:
! 			elog(ERROR, "unrecognized strategy number: %d", strategy);
! 			res = false;		/* keep compiler quiet */
! 			break;
! 	}
  	PG_RETURN_BOOL(res);
  }
*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
***************
*** 195,225 **** gtrgm_consistent(PG_FUNCTION_ARGS)
  	TRGM	   *key = (TRGM *) DatumGetPointer(entry->key);
  	TRGM	   *qtrg;
  	bool		res;
! 	char	   *cache = (char *) fcinfo->flinfo->fn_extra;
! 
! 	/* All cases served by this function are exact */
! 	*recheck = false;
  
! 	if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
  	{
! 		qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
  
  		if (cache)
  			pfree(cache);
  
  		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! 								   MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
  		cache = (char *) fcinfo->flinfo->fn_extra;
  
! 		memcpy(cache, query, VARSIZE(query));
! 		memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
  	}
  
! 	qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
  
  	switch (strategy)
  	{
  		case SimilarityStrategyNumber:
  			if (GIST_LEAF(entry))
  			{							/* all leafs contains orig trgm */
  				float4      tmpsml = cnt_sml(key, qtrg);
--- 195,248 ----
  	TRGM	   *key = (TRGM *) DatumGetPointer(entry->key);
  	TRGM	   *qtrg;
  	bool		res;
! 	char	   *cache = (char *) fcinfo->flinfo->fn_extra,
! 		       *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! #ifndef IGNORECASE
! 	if (strategy == ILIKE_STRATEGY)
! 	{
! 		elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
! 	}
! #endif
  
! 	if (cache == NULL || strategy != *((StrategyNumber *)cache) ||
! 		VARSIZE(cacheContents) != VARSIZE(query) ||
! 		memcmp(cacheContents, query, VARSIZE(query)) != 0)
  	{
! 		switch (strategy)
! 		{
! 			case SimilarityStrategyNumber:
! 				qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! 				break;
! 			case LikeStrategyNumber:
! 			case ILikeStrategyNumber:
! 				qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! 				break;
! 			default:
! 				elog(ERROR, "unrecognized strategy number: %d", strategy);
! 				qtrg = NULL;		/* keep compiler quiet */
! 				break;
! 		}
  
  		if (cache)
  			pfree(cache);
  
  		fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! 			MAXALIGN(sizeof(StrategyNumber)) + MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
  		cache = (char *) fcinfo->flinfo->fn_extra;
+ 		cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
  
! 		memcpy(cache, &strategy, sizeof(StrategyNumber));
! 		memcpy(cacheContents, query, VARSIZE(query));
! 		memcpy(cacheContents + MAXALIGN(VARSIZE(query)),
! 			qtrg, VARSIZE(qtrg));
  	}
  
! 	qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query)));
  
  	switch (strategy)
  	{
  		case SimilarityStrategyNumber:
+ 			*recheck = false;
  			if (GIST_LEAF(entry))
  			{							/* all leafs contains orig trgm */
  				float4      tmpsml = cnt_sml(key, qtrg);
***************
*** 242,247 **** gtrgm_consistent(PG_FUNCTION_ARGS)
--- 265,298 ----
  					res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
  			}
  			break;
+ 		case LikeStrategyNumber:
+ 		case ILikeStrategyNumber:
+ 			*recheck = true;
+ 			if (GIST_LEAF(entry))
+ 			{							/* all leafs contains orig trgm */
+ 				res = trgm_contain(qtrg, key);
+ 			}
+ 			else if (ISALLTRUE(key))
+ 			{							/* non-leaf contains signature */
+ 				res = true;
+ 			}
+ 			else
+ 			{							/* non-leaf contains signature */
+ 				int4 k, tmp = 0, len = ARRNELEM(qtrg);
+ 				trgm *ptr = GETARR(qtrg);
+ 				BITVECP sign = GETSIGN(key);
+ 				res = true;
+ 				for (k = 0; k < len; k++)
+ 				{
+ 					CPTRGM(((char *) &tmp), ptr + k);
+ 					if (!GETBIT(sign, HASHVAL(tmp)))
+ 					{
+ 						res = false;
+ 						break;
+ 					}
+ 				}
+ 			}
+ 			break;
  		default:
  			elog(ERROR, "unrecognized strategy number: %d", strategy);
  			res = false;		/* keep compiler quiet */
*** a/contrib/pg_trgm/trgm_op.c
--- b/contrib/pg_trgm/trgm_op.c
***************
*** 236,241 **** generate_trgm(char *str, int slen)
--- 236,404 ----
  	return trg;
  }
  
+ static char *
+ get_wildcard_part(char *str, int lenstr, char *buf, int *bytelen, int *charlen)
+ {
+ 	char	   *beginword = str, *endword, *s = buf;
+ 	bool        in_wildcard = false, in_escape = false;
+ 	int         clen;
+ 
+ 	while (beginword - str < lenstr)
+ 	{
+ 		if (in_escape)
+ 		{
+ 			in_escape = false;
+ 			in_wildcard = false;
+ 			if (iswordchr(beginword)) break;
+ 		}
+ 		else
+ 		{
+ 			if (ISESCAPECHAR(beginword))
+ 				in_escape = true;
+ 			else if (ISWILDCARDCHAR(beginword))
+ 				in_wildcard = true;
+ 			else if (iswordchr(beginword))
+ 				break;
+ 			else
+ 				in_wildcard = false;
+ 		}
+ 		beginword += pg_mblen(beginword);
+ 	}
+ 	*charlen = 0;
+ 	if (!in_wildcard)
+ 	{
+ 		if (LPADDING > 0)
+ 		{
+ 			*s++ = ' ';
+ 			(*charlen)++;
+ 			if (LPADDING > 1)
+ 			{
+ 				*s++ = ' ';
+ 				(*charlen)++;
+ 			}
+ 		}
+ 	}
+ 	if (beginword - str >= lenstr)
+ 		return NULL;
+ 
+ 	endword = beginword;
+ 	in_wildcard = false;
+ 	in_escape = false;
+ 	while (endword - str < lenstr)
+ 	{
+ 		clen = pg_mblen(endword);
+ 		if (in_escape)
+ 		{
+ 			in_escape = false;
+ 			in_wildcard = false;
+ 			if (iswordchr(endword))
+ 			{
+ 				(*charlen)++;
+ 				memcpy(s, endword, clen);
+ 				s += clen;
+ 			}
+ 			else
+ 				break;
+ 		}
+ 		else
+ 		{
+ 			if (ISESCAPECHAR(endword))
+ 				in_escape = true;
+ 			else if (ISWILDCARDCHAR(endword))
+ 			{
+ 				in_wildcard = true;
+ 				break;
+ 			}
+ 			else if (iswordchr(endword))
+ 			{
+ 				(*charlen)++;
+ 				memcpy(s, endword, clen);
+ 				s += clen;
+ 			}
+ 			else
+ 			{
+ 				in_wildcard = false;
+ 				break;
+ 			}
+ 		}
+ 		endword += clen;
+ 	}
+ 	if (!in_wildcard)
+ 	{
+ 		if (RPADDING > 0)
+ 		{
+ 			*s++ = ' ';
+ 			(*charlen)++;
+ 			if (RPADDING > 1)
+ 			{
+ 				*s++ = ' ';
+ 				(*charlen)++;
+ 			}
+ 		}
+ 	}
+ 	*bytelen = s - buf;
+ 	return endword;
+ }
+ 
+ TRGM *
+ generate_wildcard_trgm(char *str, int slen)
+ {
+ 	TRGM	   *trg;
+ 	char	   *buf,
+ 		       *buf2;
+ 	trgm	   *tptr;
+ 	int			len,
+ 				charlen,
+ 				bytelen;
+ 	char	   *eword;
+ 
+ 	trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
+ 	trg->flag = ARRKEY;
+ 	SET_VARSIZE(trg, TRGMHDRSIZE);
+ 
+ 	if (slen + LPADDING + RPADDING < 3 || slen == 0)
+ 		return trg;
+ 
+ 	tptr = GETARR(trg);
+ 
+ 	buf = palloc(sizeof(char) * (slen + 4));
+ 
+ 	eword = str;
+ 	while ((eword = get_wildcard_part(eword, slen - (eword - str),
+ 		buf, &bytelen, &charlen)) != NULL)
+ 	{
+ #ifdef IGNORECASE
+ 		buf2 = lowerstr_with_len(buf, bytelen);
+ 		bytelen = strlen(buf2);
+ #else
+ 		buf2 = buf;
+ #endif
+ 
+ 		/*
+ 		 * count trigrams
+ 		 */
+ 		tptr = make_trigrams(tptr, buf, bytelen, charlen);
+ #ifdef IGNORECASE
+ 		pfree(buf2);
+ #endif
+ 	}
+ 
+ 	pfree(buf);
+ 
+ 	if ((len = tptr - GETARR(trg)) == 0)
+ 		return trg;
+ 
+ 	if (len > 0)
+ 	{
+ 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
+ 		len = unique_array(GETARR(trg), len);
+ 	}
+ 
+ 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
+ 
+ 	return trg;
+ }
+ 
  uint32
  trgm2int(trgm *ptr)
  {
***************
*** 340,345 **** cnt_sml(TRGM *trg1, TRGM *trg2)
--- 503,544 ----
  
  }
  
+ bool
+ trgm_contain(TRGM *trg1, TRGM *trg2)
+ {
+ 	trgm	   *ptr1,
+ 			   *ptr2;
+ 	int			count = 0;
+ 	int			len1,
+ 				len2;
+ 
+ 	ptr1 = GETARR(trg1);
+ 	ptr2 = GETARR(trg2);
+ 
+ 	len1 = ARRNELEM(trg1);
+ 	len2 = ARRNELEM(trg2);
+ 
+ 	while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
+ 	{
+ 		int			res = CMPTRGM(ptr1, ptr2);
+ 
+ 		if (res < 0)
+ 			return false;
+ 		else if (res > 0)
+ 			ptr2++;
+ 		else
+ 		{
+ 			ptr1++;
+ 			ptr2++;
+ 			count++;
+ 		}
+ 	}
+ 	if (ptr1 - GETARR(trg1) < len1)
+ 		return false;
+ 	else
+ 		return true;
+ }
+ 
  PG_FUNCTION_INFO_V1(similarity);
  Datum		similarity(PG_FUNCTION_ARGS);
  Datum
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to