Hi! On Mon, Jan 31, 2011 at 12:52 AM, Jan Urbański <wulc...@wulczer.org> wrote:
> I saw that the code tries to handle ILIKE searches, but apparently it's
> failing somewhere.

It was just a typo. Corrected version attached.

----
With best regards, Alexander Korotkov.
*** a/contrib/pg_trgm/pg_trgm.sql.in --- b/contrib/pg_trgm/pg_trgm.sql.in *************** *** 113,118 **** FOR TYPE text USING gist --- 113,120 ---- AS OPERATOR 1 % (text, text), OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops, + OPERATOR 3 ~~ (text, text), + OPERATOR 4 ~~* (text, text), FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal), FUNCTION 2 gtrgm_union (bytea, internal), FUNCTION 3 gtrgm_compress (internal), *************** *** 129,140 **** RETURNS internal AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; ! CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal) RETURNS internal AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; ! CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal) RETURNS bool AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; --- 131,142 ---- AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; ! CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal) RETURNS internal AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; ! CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal) RETURNS bool AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; *************** *** 144,151 **** CREATE OPERATOR CLASS gin_trgm_ops FOR TYPE text USING gin AS OPERATOR 1 % (text, text), FUNCTION 1 btint4cmp (int4, int4), FUNCTION 2 gin_extract_trgm (text, internal), ! FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal), ! FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal), STORAGE int4; --- 146,155 ---- FOR TYPE text USING gin AS OPERATOR 1 % (text, text), + OPERATOR 3 ~~ (text, text), + OPERATOR 4 ~~* (text, text), FUNCTION 1 btint4cmp (int4, int4), FUNCTION 2 gin_extract_trgm (text, internal), ! FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal), ! 
FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal), STORAGE int4; *** a/contrib/pg_trgm/trgm.h --- b/contrib/pg_trgm/trgm.h *************** *** 13,24 **** --- 13,32 ---- #define LPADDING 2 #define RPADDING 1 #define KEEPONLYALNUM + /* + * IGNORECASE macro means that trigrams is case-insensetive. If this macro is + * disabled, then ~~* operator should be excluded from operator class, because + * we can't handle case-insensetive wildcard search with case-sensetive + * trigrams. + */ #define IGNORECASE #define DIVUNION /* operator strategy numbers */ #define SimilarityStrategyNumber 1 #define DistanceStrategyNumber 2 + #define LikeStrategyNumber 3 + #define ILikeStrategyNumber 4 typedef char trgm[3]; *************** *** 53,59 **** typedef struct /* gist */ #define BITBYTE 8 ! #define SIGLENINT 3 /* >122 => key will toast, so very slow!!! */ #define SIGLEN ( sizeof(int)*SIGLENINT ) #define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */ --- 61,67 ---- /* gist */ #define BITBYTE 8 ! #define SIGLENINT 15 /* >122 => key will toast, so very slow!!! 
*/ #define SIGLEN ( sizeof(int)*SIGLENINT ) #define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */ *************** *** 89,94 **** typedef char *BITVECP; --- 97,107 ---- extern float4 trgm_limit; TRGM *generate_trgm(char *str, int slen); + TRGM *generate_wildcard_trgm(char *str, int slen); float4 cnt_sml(TRGM *trg1, TRGM *trg2); + bool trgm_contain(TRGM *trg1, TRGM *trg2); + + #define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */ + #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') /* Wildcard meta-character */ #endif /* __TRGM_H__ */ *** a/contrib/pg_trgm/trgm_gin.c --- b/contrib/pg_trgm/trgm_gin.c *************** *** 6,11 **** --- 6,12 ---- #include "trgm.h" #include "access/gin.h" + #include "access/skey.h" #include "access/itup.h" #include "access/tuptoaster.h" #include "storage/bufpage.h" *************** *** 16,21 **** --- 17,25 ---- PG_FUNCTION_INFO_V1(gin_extract_trgm); Datum gin_extract_trgm(PG_FUNCTION_ARGS); + PG_FUNCTION_INFO_V1(gin_extract_query_trgm); + Datum gin_extract_query_trgm(PG_FUNCTION_ARGS); + PG_FUNCTION_INFO_V1(gin_trgm_consistent); Datum gin_trgm_consistent(PG_FUNCTION_ARGS); *************** *** 58,90 **** gin_extract_trgm(PG_FUNCTION_ARGS) } Datum gin_trgm_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); ! /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ /* text *query = PG_GETARG_TEXT_P(2); */ ! int32 nkeys = PG_GETARG_INT32(3); ! /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ bool *recheck = (bool *) PG_GETARG_POINTER(5); bool res = FALSE; int32 i, ! ntrue = 0; /* All cases served by this function are inexact */ *recheck = true; ! /* Count the matches */ ! for (i = 0; i < nkeys; i++) { ! if (check[i]) ! ntrue++; ! } #ifdef DIVUNION ! res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false); #else ! res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false); #endif ! 
PG_RETURN_BOOL(res); } --- 62,182 ---- } Datum + gin_extract_query_trgm(PG_FUNCTION_ARGS) + { + text *val = (text *) PG_GETARG_TEXT_P(0); + int32 *nentries = (int32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + Datum *entries = NULL; + TRGM *trg; + int4 trglen; + int32 **extra_data = (int32 **) PG_GETARG_POINTER(4); + int32 *searchMode = (int32 *)PG_GETARG_POINTER(6); + trgm *ptr; + int4 i = 0, + item; + + switch (strategy) + { + case SimilarityStrategyNumber: + trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); + break; + case LikeStrategyNumber: + case ILikeStrategyNumber: + /* + * For wildcard search we should extract all the trigrams, which + * every wildcard conforming string should include. + */ + trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", strategy); + trg = NULL; /* keep compiler quiet */ + break; + } + trglen = ARRNELEM(trg); + + *nentries = (int32) trglen; + + if (trglen > 0) + { + entries = (Datum *) palloc(sizeof(Datum) * trglen); + ptr = GETARR(trg); + while (ptr - GETARR(trg) < ARRNELEM(trg)) + { + item = trgm2int(ptr); + entries[i++] = Int32GetDatum(item); + + ptr++; + } + } + + *extra_data = (int32 *) palloc0(sizeof(int32)); + **extra_data = trglen; + + /* + * If no trigrams was extracted then we have to scan all the index. + */ + if (trglen == 0) + *searchMode = GIN_SEARCH_MODE_ALL; + + PG_RETURN_POINTER(entries); + } + + Datum gin_trgm_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); ! StrategyNumber strategy = PG_GETARG_UINT16(1); /* text *query = PG_GETARG_TEXT_P(2); */ ! /* int32 nkeys = PG_GETARG_INT32(3); */ ! int32 *extra_data = (int32 *) PG_GETARG_POINTER(4); bool *recheck = (bool *) PG_GETARG_POINTER(5); bool res = FALSE; int32 i, ! ntrue = 0, ! 
trglen; + #ifndef IGNORECASE + if (strategy == ILIKE_STRATEGY) + { + elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams."); + } + #endif /* All cases served by this function are inexact */ *recheck = true; ! trglen = *extra_data; ! ! switch (strategy) { ! case SimilarityStrategyNumber: ! /* Count the matches */ ! for (i = 0; i < trglen; i++) ! if (check[i]) ! ntrue++; #ifdef DIVUNION ! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false); #else ! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false); #endif ! break; ! case LikeStrategyNumber: ! case ILikeStrategyNumber: ! /* Check if all extracted trigrams are presented. */ ! res = true; ! for (i = 0; i < trglen; i++) ! if (!check[i]) ! { ! res = false; ! break; ! } ! break; ! default: ! elog(ERROR, "unrecognized strategy number: %d", strategy); ! res = false; /* keep compiler quiet */ ! break; ! } PG_RETURN_BOOL(res); } *** a/contrib/pg_trgm/trgm_gist.c --- b/contrib/pg_trgm/trgm_gist.c *************** *** 195,225 **** gtrgm_consistent(PG_FUNCTION_ARGS) TRGM *key = (TRGM *) DatumGetPointer(entry->key); TRGM *qtrg; bool res; ! char *cache = (char *) fcinfo->flinfo->fn_extra; ! ! /* All cases served by this function are exact */ ! *recheck = false; ! ! if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0) { ! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); if (cache) pfree(cache); fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, ! MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg)); cache = (char *) fcinfo->flinfo->fn_extra; ! memcpy(cache, query, VARSIZE(query)); ! memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg)); } ! 
qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query))); switch (strategy) { case SimilarityStrategyNumber: if (GIST_LEAF(entry)) { /* all leafs contains orig trgm */ float4 tmpsml = cnt_sml(key, qtrg); --- 195,254 ---- TRGM *key = (TRGM *) DatumGetPointer(entry->key); TRGM *qtrg; bool res; ! char *cache = (char *) fcinfo->flinfo->fn_extra, ! *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber)); ! #ifndef IGNORECASE ! if (strategy == ILIKE_STRATEGY) { ! elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams."); ! } ! #endif ! ! /* ! * Store the both strategy number and extracted trigrams in cache, because ! * trigrams extraction is relatively CPU-expensive. We should store ! * strategy number, because trigrams extraction depends on strategy. ! */ ! if (cache == NULL || strategy != *((StrategyNumber *)cache) || ! VARSIZE(cacheContents) != VARSIZE(query) || ! memcmp(cacheContents, query, VARSIZE(query)) != 0) ! { ! switch (strategy) ! { ! case SimilarityStrategyNumber: ! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); ! break; ! case LikeStrategyNumber: ! case ILikeStrategyNumber: ! qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ); ! break; ! default: ! elog(ERROR, "unrecognized strategy number: %d", strategy); ! qtrg = NULL; /* keep compiler quiet */ ! break; ! } if (cache) pfree(cache); fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, ! MAXALIGN(sizeof(StrategyNumber)) + MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg)); cache = (char *) fcinfo->flinfo->fn_extra; + cacheContents = cache + MAXALIGN(sizeof(StrategyNumber)); ! memcpy(cache, &strategy, sizeof(StrategyNumber)); ! memcpy(cacheContents, query, VARSIZE(query)); ! memcpy(cacheContents + MAXALIGN(VARSIZE(query)), ! qtrg, VARSIZE(qtrg)); } ! qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query))); switch (strategy) { case SimilarityStrategyNumber: + /* Similarity search is exact. 
*/ + *recheck = false; if (GIST_LEAF(entry)) { /* all leafs contains orig trgm */ float4 tmpsml = cnt_sml(key, qtrg); *************** *** 242,247 **** gtrgm_consistent(PG_FUNCTION_ARGS) --- 271,308 ---- res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false; } break; + case LikeStrategyNumber: + case ILikeStrategyNumber: + /* + * Wildcard search is inexact. It checks if all of extracted + * trigrams can be present if child nodes. + */ + *recheck = true; + if (GIST_LEAF(entry)) + { /* all leafs contains orig trgm */ + res = trgm_contain(qtrg, key); + } + else if (ISALLTRUE(key)) + { /* non-leaf contains signature */ + res = true; + } + else + { /* non-leaf contains signature */ + int4 k, tmp = 0, len = ARRNELEM(qtrg); + trgm *ptr = GETARR(qtrg); + BITVECP sign = GETSIGN(key); + res = true; + for (k = 0; k < len; k++) + { + CPTRGM(((char *) &tmp), ptr + k); + if (!GETBIT(sign, HASHVAL(tmp))) + { + res = false; + break; + } + } + } + break; default: elog(ERROR, "unrecognized strategy number: %d", strategy); res = false; /* keep compiler quiet */ *** a/contrib/pg_trgm/trgm_op.c --- b/contrib/pg_trgm/trgm_op.c *************** *** 236,241 **** generate_trgm(char *str, int slen) --- 236,443 ---- return trg; } + /* + * Extracts part of wildcard, bounded by '_' and '%' meta-characters, non-word + * characters or string boundaries, into buf. Returns pointer to the end of + * part. Writes byte and character length of found part into correspondingly + * bytelen and charlen. If a wildcard part is bounded by non-word characters or + * string boundaries then this function will include corresponding padding + * spaces into part. + */ + static char * + get_wildcard_part(char *str, int lenstr, char *buf, int *bytelen, int *charlen) + { + char *beginword = str, *endword, *s = buf; + bool in_wildcard_meta = false, in_escape = false; + int clen; + + /* + * Find the first word character remembering whether last charater was + * wildcard meta-character. 
+ */ + while (beginword - str < lenstr) + { + if (in_escape) + { + in_escape = false; + in_wildcard_meta = false; + if (iswordchr(beginword)) break; + } + else + { + if (ISESCAPECHAR(beginword)) + in_escape = true; + else if (ISWILDCARDCHAR(beginword)) + in_wildcard_meta = true; + else if (iswordchr(beginword)) + break; + else + in_wildcard_meta = false; + } + beginword += pg_mblen(beginword); + } + + /* + * Add left padding spaces if last chataster wasn't wildcard meta-character. + */ + *charlen = 0; + if (!in_wildcard_meta) + { + if (LPADDING > 0) + { + *s++ = ' '; + (*charlen)++; + if (LPADDING > 1) + { + *s++ = ' '; + (*charlen)++; + } + } + } + + /* + * Handle string end. + */ + if (beginword - str >= lenstr) + return NULL; + + /* + * Copy part of wildcard into buf until wildcard meta-character, non-word + * character or string boundary. Escapes is stripping during copy. + */ + endword = beginword; + in_wildcard_meta = false; + in_escape = false; + while (endword - str < lenstr) + { + clen = pg_mblen(endword); + if (in_escape) + { + in_escape = false; + in_wildcard_meta = false; + if (iswordchr(endword)) + { + (*charlen)++; + memcpy(s, endword, clen); + s += clen; + } + else + break; + } + else + { + if (ISESCAPECHAR(endword)) + in_escape = true; + else if (ISWILDCARDCHAR(endword)) + { + in_wildcard_meta = true; + break; + } + else if (iswordchr(endword)) + { + (*charlen)++; + memcpy(s, endword, clen); + s += clen; + } + else + { + in_wildcard_meta = false; + break; + } + } + endword += clen; + } + + /* + * Add right padding spaces if last chataster wasn't wildcard meta-character. + */ + if (!in_wildcard_meta) + { + if (RPADDING > 0) + { + *s++ = ' '; + (*charlen)++; + if (RPADDING > 1) + { + *s++ = ' '; + (*charlen)++; + } + } + } + *bytelen = s - buf; + return endword; + } + + /* + * Generates trigrams for wildcard. Returns array of trigrams, which must occur + * in any string, which conforms to wildcard. 
For example, from "a%bcd%" pattern + * trigrams " a", "bcd" would be extracted. + */ + TRGM * + generate_wildcard_trgm(char *str, int slen) + { + TRGM *trg; + char *buf, + *buf2; + trgm *tptr; + int len, + charlen, + bytelen; + char *eword; + + trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3); + trg->flag = ARRKEY; + SET_VARSIZE(trg, TRGMHDRSIZE); + + if (slen + LPADDING + RPADDING < 3 || slen == 0) + return trg; + + tptr = GETARR(trg); + + buf = palloc(sizeof(char) * (slen + 4)); + + /* + * Extract trigrams from every wildcard part extracted by get_wildcard_part. + */ + eword = str; + while ((eword = get_wildcard_part(eword, slen - (eword - str), + buf, &bytelen, &charlen)) != NULL) + { + #ifdef IGNORECASE + buf2 = lowerstr_with_len(buf, bytelen); + bytelen = strlen(buf2); + #else + buf2 = buf; + #endif + + /* + * count trigrams + */ + tptr = make_trigrams(tptr, buf2, bytelen, charlen); + #ifdef IGNORECASE + pfree(buf2); + #endif + } + + pfree(buf); + + if ((len = tptr - GETARR(trg)) == 0) + return trg; + + /* + * Make trigrams unique. + */ + if (len > 0) + { + qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); + len = unique_array(GETARR(trg), len); + } + + SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); + + return trg; + } + uint32 trgm2int(trgm *ptr) { *************** *** 340,345 **** cnt_sml(TRGM *trg1, TRGM *trg2) --- 542,586 ---- } + /* + * Returns whether trg2 contains all trigrams from trg1. 
+ */ + bool + trgm_contain(TRGM *trg1, TRGM *trg2) + { + trgm *ptr1, + *ptr2; + int count = 0; + int len1, + len2; + + ptr1 = GETARR(trg1); + ptr2 = GETARR(trg2); + + len1 = ARRNELEM(trg1); + len2 = ARRNELEM(trg2); + + while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2) + { + int res = CMPTRGM(ptr1, ptr2); + + if (res < 0) + return false; + else if (res > 0) + ptr2++; + else + { + ptr1++; + ptr2++; + count++; + } + } + if (ptr1 - GETARR(trg1) < len1) + return false; + else + return true; + } + PG_FUNCTION_INFO_V1(similarity); Datum similarity(PG_FUNCTION_ARGS); Datum *** a/contrib/pg_trgm/uninstall_pg_trgm.sql --- b/contrib/pg_trgm/uninstall_pg_trgm.sql *************** *** 27,35 **** DROP OPERATOR CLASS gin_trgm_ops USING gin; DROP FUNCTION gin_extract_trgm(text, internal); ! DROP FUNCTION gin_extract_trgm(text, internal, int2, internal, internal); ! DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal); DROP OPERATOR % (text, text); --- 27,35 ---- DROP FUNCTION gin_extract_trgm(text, internal); ! DROP FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal); ! DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal); DROP OPERATOR % (text, text);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers