Hackers, Here is first version of patch, which enable index support of wildcard search in pg_trgm contrib module. The idea of the patch is to extract from wildcard trigrams which should occurs in wildcard matching string. For example, for '%sector%' wildcard such trigrams would be: 'sec', 'ect', 'tor'.
create table words (word text); copy words from '/usr/share/dict/american-english'; test=# explain analyze select * from words where word ilike '%independ%'; QUERY PLAN ------------------------------------------------------------------------------------------------------ Seq Scan on words (cost=0.00..1703.11 rows=10 width=9) (actual time=18.818..174.146 rows=7 loops=1) Filter: (word ~~* '%independ%'::text) Total runtime: 174.200 ms (3 rows) CREATE INDEX trgm_idx ON words USING gist (word gist_trgm_ops); test=# explain analyze select * from words where word ilike '%independ%'; QUERY PLAN ------------------------------------------------------------------------------------------------------------------ Bitmap Heap Scan on words (cost=4.36..40.11 rows=10 width=9) (actual time=2.445..2.529 rows=7 loops=1) Recheck Cond: (word ~~* '%independ%'::text) -> Bitmap Index Scan on trgm_idx (cost=0.00..4.35 rows=10 width=0) (actual time=2.406..2.406 rows=7 loops=1) Index Cond: (word ~~* '%independ%'::text) Total runtime: 2.612 ms (5 rows) CREATE INDEX trgm_idx ON words USING gin (word gin_trgm_ops); test=# explain analyze select * from words where word ilike '%independ%'; QUERY PLAN ------------------------------------------------------------------------------------------------------------------- Bitmap Heap Scan on words (cost=76.08..111.83 rows=10 width=9) (actual time=2.675..2.755 rows=7 loops=1) Recheck Cond: (word ~~* '%independ%'::text) -> Bitmap Index Scan on trgm_idx (cost=0.00..76.07 rows=10 width=0) (actual time=2.642..2.642 rows=7 loops=1) Index Cond: (word ~~* '%independ%'::text) Total runtime: 2.839 ms (5 rows) I've encountered with following problems: 1) Indexing support for ilike is possible only with case-insensetive wildcards, e.g. when IGNORECASE macro is enabled. But I can't use this macro in pg_trgm.sql.in, where list of operators is defined. Probably, is it enuogh to put comment near IGNORECASE, which tells that if one disable this macro he should also remove oparators from pg_trgm.sql.in? 2) I found gist index not very useful with default SIGLENINT = 3. I've changed this value to 15 and I found gist index performs very good on dictionary. But on longer strings greater values of SIGLENINT may be required (probably even SIGLENINT > 122 will give benefit in some cases in spite of TOAST). ---- With best regards, Alexander Korotkov.
*** a/contrib/pg_trgm/pg_trgm.sql.in --- b/contrib/pg_trgm/pg_trgm.sql.in *************** *** 113,118 **** FOR TYPE text USING gist --- 113,120 ---- AS OPERATOR 1 % (text, text), OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 3 ~~ (text, text), + OPERATOR 4 ~~* (text, text), FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal), FUNCTION 2 gtrgm_union (bytea, internal), FUNCTION 3 gtrgm_compress (internal), *************** *** 144,149 **** CREATE OPERATOR CLASS gin_trgm_ops --- 146,153 ---- FOR TYPE text USING gin AS OPERATOR 1 % (text, text), + OPERATOR 3 ~~ (text, text), + OPERATOR 4 ~~* (text, text), FUNCTION 1 btint4cmp (int4, int4), FUNCTION 2 gin_extract_trgm (text, internal), FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal), *** a/contrib/pg_trgm/trgm.h --- b/contrib/pg_trgm/trgm.h *************** *** 19,24 **** --- 19,26 ---- /* operator strategy numbers */ #define SimilarityStrategyNumber 1 #define DistanceStrategyNumber 2 + #define LikeStrategyNumber 3 + #define ILikeStrategyNumber 4 typedef char trgm[3]; *************** *** 53,59 **** typedef struct /* gist */ #define BITBYTE 8 ! #define SIGLENINT 3 /* >122 => key will toast, so very slow!!! */ #define SIGLEN ( sizeof(int)*SIGLENINT ) #define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */ --- 55,61 ---- /* gist */ #define BITBYTE 8 ! #define SIGLENINT 15 /* >122 => key will toast, so very slow!!! */ #define SIGLEN ( sizeof(int)*SIGLENINT ) #define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */ *************** *** 89,94 **** typedef char *BITVECP; --- 91,101 ---- extern float4 trgm_limit; TRGM *generate_trgm(char *str, int slen); + TRGM *generate_wildcard_trgm(char *str, int slen); float4 cnt_sml(TRGM *trg1, TRGM *trg2); + bool trgm_contain(TRGM *trg1, TRGM *trg2); + + #define ISESCAPECHAR(x) (*(x) == '\\') + #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') #endif /* __TRGM_H__ */ *** a/contrib/pg_trgm/trgm_gin.c --- b/contrib/pg_trgm/trgm_gin.c *************** *** 6,11 **** --- 6,12 ---- #include "trgm.h" #include "access/gin.h" + #include "access/skey.h" #include "access/itup.h" #include "access/tuptoaster.h" #include "storage/bufpage.h" *************** *** 24,36 **** gin_extract_trgm(PG_FUNCTION_ARGS) { text *val = (text *) PG_GETARG_TEXT_P(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); Datum *entries = NULL; TRGM *trg; int4 trglen; *nentries = 0; ! trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); trglen = ARRNELEM(trg); if (trglen > 0) --- 25,56 ---- { text *val = (text *) PG_GETARG_TEXT_P(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy; Datum *entries = NULL; TRGM *trg; int4 trglen; + if (fcinfo->nargs == 2) + strategy = SimilarityStrategyNumber; + else + strategy = PG_GETARG_UINT16(2); + *nentries = 0; ! switch (strategy) ! { ! case SimilarityStrategyNumber: ! trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); ! break; ! case LikeStrategyNumber: ! case ILikeStrategyNumber: ! trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); ! break; ! default: ! elog(ERROR, "unrecognized strategy number: %d", strategy); ! trg = NULL; /* keep compiler quiet */ ! break; ! } trglen = ARRNELEM(trg); if (trglen > 0) *************** *** 71,77 **** gin_trgm_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); ! /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ /* text *query = PG_GETARG_TEXT_P(2); */ /* int32 nkeys = PG_GETARG_INT32(3); */ Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); --- 91,97 ---- { bool *check = (bool *) PG_GETARG_POINTER(0); ! StrategyNumber strategy = PG_GETARG_UINT16(1); /* text *query = PG_GETARG_TEXT_P(2); */ /* int32 nkeys = PG_GETARG_INT32(3); */ Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); *************** *** 81,100 **** gin_trgm_consistent(PG_FUNCTION_ARGS) trglen, ntrue = 0; /* All cases served by this function are inexact */ *recheck = true; trglen = *(int32 *) extra_data; ! for (i = 0; i < trglen; i++) ! if (check[i]) ! ntrue++; #ifdef DIVUNION ! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false); #else ! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false); #endif ! PG_RETURN_BOOL(res); } --- 101,144 ---- trglen, ntrue = 0; + #ifndef IGNORECASE + if (strategy == ILIKE_STRATEGY) + { + elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams."); + } + #endif /* All cases served by this function are inexact */ *recheck = true; trglen = *(int32 *) extra_data; ! switch (strategy) ! { ! case SimilarityStrategyNumber: ! for (i = 0; i < trglen; i++) ! if (check[i]) ! ntrue++; #ifdef DIVUNION ! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false); #else ! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false); #endif ! break; ! case LikeStrategyNumber: ! case ILikeStrategyNumber: ! res = true; ! for (i = 0; i < trglen; i++) ! if (!check[i]) ! { ! res = false; ! break; ! } ! break; ! default: ! elog(ERROR, "unrecognized strategy number: %d", strategy); ! res = false; /* keep compiler quiet */ ! break; ! } PG_RETURN_BOOL(res); } *** a/contrib/pg_trgm/trgm_gist.c --- b/contrib/pg_trgm/trgm_gist.c *************** *** 195,225 **** gtrgm_consistent(PG_FUNCTION_ARGS) TRGM *key = (TRGM *) DatumGetPointer(entry->key); TRGM *qtrg; bool res; ! char *cache = (char *) fcinfo->flinfo->fn_extra; ! ! /* All cases served by this function are exact */ ! *recheck = false; ! if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0) { ! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); if (cache) pfree(cache); fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, ! MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg)); cache = (char *) fcinfo->flinfo->fn_extra; ! memcpy(cache, query, VARSIZE(query)); ! memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg)); } ! qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query))); switch (strategy) { case SimilarityStrategyNumber: if (GIST_LEAF(entry)) { /* all leafs contains orig trgm */ float4 tmpsml = cnt_sml(key, qtrg); --- 195,248 ---- TRGM *key = (TRGM *) DatumGetPointer(entry->key); TRGM *qtrg; bool res; ! char *cache = (char *) fcinfo->flinfo->fn_extra, ! *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber)); ! #ifndef IGNORECASE ! if (strategy == ILIKE_STRATEGY) ! { ! elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams."); ! } ! #endif ! if (cache == NULL || strategy != *((StrategyNumber *)cache) || ! VARSIZE(cacheContents) != VARSIZE(query) || ! memcmp(cacheContents, query, VARSIZE(query)) != 0) { ! switch (strategy) ! { ! case SimilarityStrategyNumber: ! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); ! break; ! case LikeStrategyNumber: ! case ILikeStrategyNumber: ! qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); ! break; ! default: ! elog(ERROR, "unrecognized strategy number: %d", strategy); ! qtrg = NULL; /* keep compiler quiet */ ! break; ! } if (cache) pfree(cache); fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, ! MAXALIGN(sizeof(StrategyNumber)) + MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg)); cache = (char *) fcinfo->flinfo->fn_extra; + cacheContents = cache + MAXALIGN(sizeof(StrategyNumber)); ! memcpy(cache, &strategy, sizeof(StrategyNumber)); ! memcpy(cacheContents, query, VARSIZE(query)); ! memcpy(cacheContents + MAXALIGN(VARSIZE(query)), ! qtrg, VARSIZE(qtrg)); } ! qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query))); switch (strategy) { case SimilarityStrategyNumber: + *recheck = false; if (GIST_LEAF(entry)) { /* all leafs contains orig trgm */ float4 tmpsml = cnt_sml(key, qtrg); *************** *** 242,247 **** gtrgm_consistent(PG_FUNCTION_ARGS) --- 265,298 ---- res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false; } break; + case LikeStrategyNumber: + case ILikeStrategyNumber: + *recheck = true; + if (GIST_LEAF(entry)) + { /* all leafs contains orig trgm */ + res = trgm_contain(qtrg, key); + } + else if (ISALLTRUE(key)) + { /* non-leaf contains signature */ + res = true; + } + else + { /* non-leaf contains signature */ + int4 k, tmp = 0, len = ARRNELEM(qtrg); + trgm *ptr = GETARR(qtrg); + BITVECP sign = GETSIGN(key); + res = true; + for (k = 0; k < len; k++) + { + CPTRGM(((char *) &tmp), ptr + k); + if (!GETBIT(sign, HASHVAL(tmp))) + { + res = false; + break; + } + } + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); res = false; /* keep compiler quiet */ *** a/contrib/pg_trgm/trgm_op.c --- b/contrib/pg_trgm/trgm_op.c *************** *** 236,241 **** generate_trgm(char *str, int slen) --- 236,404 ---- return trg; } + static char * + get_wildcard_part(char *str, int lenstr, char *buf, int *bytelen, int *charlen) + { + char *beginword = str, *endword, *s = buf; + bool in_wildcard = false, in_escape = false; + int clen; + + while (beginword - str < lenstr) + { + if (in_escape) + { + in_escape = false; + in_wildcard = false; + if (iswordchr(beginword)) break; + } + else + { + if (ISESCAPECHAR(beginword)) + in_escape = true; + else if (ISWILDCARDCHAR(beginword)) + in_wildcard = true; + else if (iswordchr(beginword)) + break; + else + in_wildcard = false; + } + beginword += pg_mblen(beginword); + } + *charlen = 0; + if (!in_wildcard) + { + if (LPADDING > 0) + { + *s++ = ' '; + (*charlen)++; + if (LPADDING > 1) + { + *s++ = ' '; + (*charlen)++; + } + } + } + if (beginword - str >= lenstr) + return NULL; + + endword = beginword; + in_wildcard = false; + in_escape = false; + while (endword - str < lenstr) + { + clen = pg_mblen(endword); + if (in_escape) + { + in_escape = false; + in_wildcard = false; + if (iswordchr(endword)) + { + (*charlen)++; + memcpy(s, endword, clen); + s += clen; + } + else + break; + } + else + { + if (ISESCAPECHAR(endword)) + in_escape = true; + else if (ISWILDCARDCHAR(endword)) + { + in_wildcard = true; + break; + } + else if (iswordchr(endword)) + { + (*charlen)++; + memcpy(s, endword, clen); + s += clen; + } + else + { + in_wildcard = false; + break; + } + } + endword += clen; + } + if (!in_wildcard) + { + if (RPADDING > 0) + { + *s++ = ' '; + (*charlen)++; + if (RPADDING > 1) + { + *s++ = ' '; + (*charlen)++; + } + } + } + *bytelen = s - buf; + return endword; + } + + TRGM * + generate_wildcard_trgm(char *str, int slen) + { + TRGM *trg; + char *buf, + *buf2; + trgm *tptr; + int len, + charlen, + bytelen; + char *eword; + + trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3); + trg->flag = ARRKEY; + SET_VARSIZE(trg, TRGMHDRSIZE); + + if (slen + LPADDING + RPADDING < 3 || slen == 0) + return trg; + + tptr = GETARR(trg); + + buf = palloc(sizeof(char) * (slen + 4)); + + eword = str; + while ((eword = get_wildcard_part(eword, slen - (eword - str), + buf, &bytelen, &charlen)) != NULL) + { + #ifdef IGNORECASE + buf2 = lowerstr_with_len(buf, bytelen); + bytelen = strlen(buf2); + #else + buf2 = buf; + #endif + + /* + * count trigrams + */ + tptr = make_trigrams(tptr, buf, bytelen, charlen); + #ifdef IGNORECASE + pfree(buf2); + #endif + } + + pfree(buf); + + if ((len = tptr - GETARR(trg)) == 0) + return trg; + + if (len > 0) + { + qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); + len = unique_array(GETARR(trg), len); + } + + SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); + + return trg; + } + uint32 trgm2int(trgm *ptr) { *************** *** 340,345 **** cnt_sml(TRGM *trg1, TRGM *trg2) --- 503,544 ---- } + bool + trgm_contain(TRGM *trg1, TRGM *trg2) + { + trgm *ptr1, + *ptr2; + int count = 0; + int len1, + len2; + + ptr1 = GETARR(trg1); + ptr2 = GETARR(trg2); + + len1 = ARRNELEM(trg1); + len2 = ARRNELEM(trg2); + + while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2) + { + int res = CMPTRGM(ptr1, ptr2); + + if (res < 0) + return false; + else if (res > 0) + ptr2++; + else + { + ptr1++; + ptr2++; + count++; + } + } + if (ptr1 - GETARR(trg1) < len1) + return false; + else + return true; + } + PG_FUNCTION_INFO_V1(similarity); Datum similarity(PG_FUNCTION_ARGS); Datum
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers