2010/9/1 David Fetter <da...@fetter.org>: > On Tue, Aug 31, 2010 at 11:07:40PM +0200, Pavel Stehule wrote: >> Hello >> >> attached WIP patch. > > I don't see it attached. Is it just me?
sorry, it was at 1 after midnight Regards Pavel > > Cheers, > David. > -- > David Fetter <da...@fetter.org> http://fetter.org/ > Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter > Skype: davidfetter XMPP: david.fet...@gmail.com > iCal: webcal://www.tripit.com/feed/ical/people/david74/tripit.ics > > Remember to vote! > Consider donating to Postgres: http://www.postgresql.org/about/donate >
*** ./src/backend/tsearch/dict_ispell.c.orig 2010-08-23 09:16:49.000000000 +0200 --- ./src/backend/tsearch/dict_ispell.c 2010-08-31 23:46:00.178669635 +0200 *************** *** 37,113 **** dictloaded = false, stoploaded = false; ListCell *l; d = (DictISpell *) palloc0(sizeof(DictISpell)); ! foreach(l, dictoptions) { ! DefElem *defel = (DefElem *) lfirst(l); ! ! if (pg_strcasecmp(defel->defname, "DictFile") == 0) { ! if (dictloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("multiple DictFile parameters"))); ! NIImportDictionary(&(d->obj), ! get_tsearch_config_filename(defGetString(defel), ! "dict")); ! dictloaded = true; } ! else if (pg_strcasecmp(defel->defname, "AffFile") == 0) { ! if (affloaded) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("multiple AffFile parameters"))); ! NIImportAffixes(&(d->obj), ! get_tsearch_config_filename(defGetString(defel), ! "affix")); ! affloaded = true; } ! else if (pg_strcasecmp(defel->defname, "StopWords") == 0) { ! if (stoploaded) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("multiple StopWords parameters"))); ! readstoplist(defGetString(defel), &(d->stoplist), lowerstr); ! stoploaded = true; } else { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("unrecognized Ispell parameter: \"%s\"", ! defel->defname))); } ! 
} - if (affloaded && dictloaded) - { - NISortDictionary(&(d->obj)); - NISortAffixes(&(d->obj)); - } - else if (!affloaded) - { - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("missing AffFile parameter"))); - } - else - { - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("missing DictFile parameter"))); } MemoryContextDeleteChildren(CurrentMemoryContext); MemoryContextStats(CurrentMemoryContext); - PG_RETURN_POINTER(d); } --- 37,132 ---- dictloaded = false, stoploaded = false; ListCell *l; + int i; d = (DictISpell *) palloc0(sizeof(DictISpell)); + + d->obj.stream = fopen("/tmp/xxx.ft", "r"); + d->obj.mode = 'r'; ! if (d->obj.mode == 'r') { ! readSPDict(d->obj.stream, &d->obj); ! readAffix(d->obj.stream, &d->obj); ! postProcessAffixes(&d->obj); ! readStopList(d->obj.stream, &d->stoplist); ! } ! else ! { ! foreach(l, dictoptions) { ! DefElem *defel = (DefElem *) lfirst(l); ! ! if (pg_strcasecmp(defel->defname, "DictFile") == 0) ! { ! if (dictloaded) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("multiple DictFile parameters"))); ! NIImportDictionary(&(d->obj), ! get_tsearch_config_filename(defGetString(defel), ! "dict")); ! dictloaded = true; ! } ! else if (pg_strcasecmp(defel->defname, "AffFile") == 0) ! { ! if (affloaded) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("multiple AffFile parameters"))); ! NIImportAffixes(&(d->obj), ! get_tsearch_config_filename(defGetString(defel), ! "affix")); ! affloaded = true; ! } ! else if (pg_strcasecmp(defel->defname, "StopWords") == 0) ! { ! if (stoploaded) ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("multiple StopWords parameters"))); ! readstoplist(defGetString(defel), &(d->stoplist), lowerstr); ! stoploaded = true; ! ! } ! else ! { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("unrecognized Ispell parameter: \"%s\"", ! defel->defname))); ! } } ! ! if (affloaded && dictloaded) { ! 
NISortDictionary(&(d->obj)); ! NISortAffixes(&(d->obj)); } ! else if (!affloaded) { ! ereport(ERROR, ! (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("missing AffFile parameter"))); } else { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ! errmsg("missing DictFile parameter"))); } ! ! if (d->obj.stream != NULL && d->obj.mode == 'w') ! outStopList(d->obj.stream, &d->stoplist); } MemoryContextDeleteChildren(CurrentMemoryContext); MemoryContextStats(CurrentMemoryContext); + fclose(d->obj.stream); PG_RETURN_POINTER(d); } *** ./src/backend/tsearch/spell.c.orig 2010-01-02 17:57:53.000000000 +0100 --- ./src/backend/tsearch/spell.c 2010-08-31 23:55:16.054672520 +0200 *************** *** 11,23 **** * *------------------------------------------------------------------------- */ - #include "postgres.h" #include "tsearch/dicts/spell.h" #include "tsearch/ts_locale.h" #include "utils/memutils.h" /* * Initialization requires a lot of memory that's not needed --- 11,26 ---- * *------------------------------------------------------------------------- */ #include "postgres.h" #include "tsearch/dicts/spell.h" #include "tsearch/ts_locale.h" + #include "tsearch/ts_public.h" #include "utils/memutils.h" + #include <stdio.h> + #include <time.h> + /* * Initialization requires a lot of memory that's not needed *************** *** 28,36 **** --- 31,367 ---- */ static MemoryContext tmpCtx = NULL; + static void *prealloc_mem = NULL; + static Size prealloc_free_size; + + static void checkTmpCtx(void); + #define tmpalloc(sz) MemoryContextAlloc(tmpCtx, (sz)) #define tmpalloc0(sz) MemoryContextAllocZero(tmpCtx, (sz)) + #define WRITE_BINARY(buff, stream) \ + do { \ + if (fwrite(&(buff), sizeof(buff), 1, stream) != 1) \ + elog(ERROR, "cannot to write to prepared dictionary file"); \ + } while (0); + + #define WRITE_STRING(buff, stream) \ + do { \ + int len = -1; \ + if ((buff) != NULL) \ + { \ + int len = strlen(buff) + 1; \ + WRITE_BINARY(len, stream); \ + if (fwrite(buff, len, 1, 
stream) != 1) \ + elog(ERROR, "cannot to write to prepared dictionary file"); \ + } \ + else \ + { \ + WRITE_BINARY(len, stream); \ + } \ + } while (0); + + #define WRITE_BINARY_STRING(buff, size, stream) \ + do { \ + if (fwrite(buff, size, 1, stream) != 1) \ + elog(ERROR, "cannot to write to prepared dictionary file"); \ + } while (0); + + #define READ_BINARY(buff, stream) \ + do { \ + if (fread(&(buff), sizeof(buff), 1, stream) != 1) \ + elog(ERROR, "cannot to load a prepared dictionary file"); \ + } while (0) + + #define READ_STRING(target, stream) \ + do { \ + int len; \ + READ_BINARY(len, stream); \ + if (len != -1) \ + { \ + target = (char *) palloc(len); \ + if (fread(target, len, 1, stream) != 1) \ + elog(ERROR, "cannot to load a prepared dictionary file"); \ + } \ + else \ + target = NULL; \ + } while (0) + + #define READ_BINARY_STRING(buff, size, stream) \ + do { \ + if (fread(buff, size, 1, stream) != 1) \ + elog(ERROR, "cannot to load a prepared dictionary file"); \ + } while(0); + + /* + * spell dictionary uses a thousands SPNodes. These nodes are never + * individually released, so we can pass by memory context managament + * and solve a interesting size of memory. 
+ */ + static SPNode * + allocSPNode(int nchar) + { + Size size = MAXALIGN(SPNHDRSZ + nchar * sizeof(SPNodeData)); + void *ret; + + /* use a prealloc_mem only for small requests */ + if (size > ALLOCSET_DEFAULT_INITSIZE / 3) + return palloc(size); + + if (prealloc_mem == NULL || size > prealloc_free_size) + { + prealloc_mem = palloc(ALLOCSET_DEFAULT_INITSIZE); + prealloc_free_size = ALLOCSET_DEFAULT_INITSIZE; + } + + Assert(prealloc_mem != NULL); + Assert(prealloc_mem == (void *) MAXALIGN(prealloc_mem)); + + ret = memset(prealloc_mem, 0, size); + + /* reduce a used block from preallocated memory */ + prealloc_free_size -= size; + prealloc_mem = (char *) prealloc_mem + size; + + return ret; + } + + /* + * Parsing a spell dictionary is slow, so we must to mimimalize + * the number of this task. One possibility is serialisation + * and deseralisation of Ispell dictionary. + */ + static void + outSPNode(FILE *stream, SPNode *node) + { + int i; + uint32 length = node->length; + + WRITE_BINARY(length, stream); + + for (i = 0; i < node->length; i++) + { + SPNodeData *data = &node->data[i]; + uint32 aux = data->val | data->isword << 8 + | data->compoundflag << 9 | data->affix << 13; + + WRITE_BINARY(aux, stream); + + if (data->node) + outSPNode(stream, data->node); + else + { + length = 0; + WRITE_BINARY(length, stream); + } + } + } + + static SPNode * + readSPNode(FILE *stream) + { + int i; + uint32 length; + SPNode *node; + + READ_BINARY(length, stream); + + /* there are not other node */ + if (length == 0) + return NULL; + + node = allocSPNode(length); + node->length = length; + + for (i = 0; i < node->length; i++) + { + SPNodeData *data = &node->data[i]; + uint32 aux; + + READ_BINARY(aux, stream); + + data->val = aux & 0xFF; + data->isword = aux >> 8 & 1; + data->compoundflag = aux >> 9 & 0xF; + data->affix = aux >> 13 & 0x7FFFF; + + data->node = readSPNode(stream); + } + + return node; + } + + static void + outSPDict(FILE *stream, IspellDict *Conf) + { + int i; + + 
WRITE_BINARY(Conf->nAffixData, stream); + + for (i = 0; i < Conf->nAffixData; i++) + { + WRITE_STRING(Conf->AffixData[i], stream); + } + + outSPNode(stream, Conf->Dictionary); + } + + void + readSPDict(FILE *stream, IspellDict *Conf) + { + int i; + + checkTmpCtx(); + + READ_BINARY(Conf->nAffixData, stream); + + Conf->AffixData = (char **) palloc(Conf->nAffixData * sizeof(char *)); + + for (i = 0; i < Conf->nAffixData; i++) + { + READ_STRING(Conf->AffixData[i], stream); + } + + Conf->Dictionary = readSPNode(stream); + } + + static void + outRegisNode(FILE *stream, RegisNode *node) + { + do + { + int len = node->len; + uint32 aux = node->type | node->len << 2; + + WRITE_BINARY(len, stream); + WRITE_BINARY(aux, stream); + WRITE_BINARY_STRING(&node->data, len, stream); + + node = node->next; + if (!node) + { + /* append end tag */ + len = 0; + WRITE_BINARY(len, stream); + } + + } while (node != NULL); + } + + static RegisNode * + readRegisNode(FILE *stream) + { + int len; + RegisNode *result = NULL; + RegisNode *node, + *prev = NULL; + + do + { + READ_BINARY(len, stream); + if (len > 0) + { + uint32 aux; + + node = (RegisNode *) palloc0(RNHDRSZ + len + 1); + if (result == NULL) + result = node; + else + prev->next = node; + + READ_BINARY(aux, stream); + node->type = aux & 3; + node->len = aux >> 2 & 65535; + READ_BINARY_STRING(node->data, len, stream); + prev = node; + } + } while (len > 0); + + return result; + } + + static void + outRegis(FILE *stream, Regis *regis) + { + uint32 aux = regis->issuffix | regis->nchar << 1; + + WRITE_BINARY(aux, stream); + outRegisNode(stream, regis->node); + } + + static void + readRegis(FILE *stream, Regis *regis) + { + uint32 aux; + + READ_BINARY(aux, stream); + regis->issuffix = aux & 1; + regis->nchar = aux >> 1 & 65535; + regis->node = readRegisNode(stream); + } + + static void + outAFFIX(FILE *stream, AFFIX *aff) + { + uint32 aux = aff->flag | aff->type << 8 | aff->flagflags << 9 | + aff->issimple << 16 | aff->isregis << 17 | 
aff->replen << 18; + + WRITE_BINARY(aux, stream); + WRITE_STRING(aff->find, stream); + WRITE_STRING(aff->repl, stream); + + if (aff->isregis) + outRegis(stream, &aff->reg.regis); + } + + static void + readAFFIX(FILE *stream, AFFIX *aff) + { + uint32 aux; + + checkTmpCtx(); + + READ_BINARY(aux, stream); + aff->flag = aux & 255; + aff->type = aux >> 8 & 1; + aff->flagflags = aux >> 9 & 127; + aff->issimple = aux >> 16 & 1; + aff->isregis = aux >> 17 & 1; + aff->replen = (aux >> 18) & 16383; + + READ_STRING(aff->find, stream); + READ_STRING(aff->repl, stream); + + if (aff->isregis) + readRegis(stream, &aff->reg.regis); + } + + static void + outAffix(FILE *stream, IspellDict *Conf) + { + int i; + + WRITE_BINARY(Conf->naffixes, stream); + for (i = 0; i < Conf->naffixes; i++) + { + outAFFIX(stream, &Conf->Affix[i]); + } + } + + void + readAffix(FILE *stream, IspellDict *Conf) + { + int i; + + READ_BINARY(Conf->naffixes, stream); + + Conf->Affix = (AFFIX *) palloc(Conf->naffixes * sizeof(AFFIX)); + for (i = 0; i < Conf->naffixes; i++) + { + readAFFIX(stream, &Conf->Affix[i]); + } + } + static void checkTmpCtx(void) { *************** *** 63,68 **** --- 394,424 ---- return dst; } + void + outStopList(FILE *stream, StopList *s) + { + int i; + + WRITE_BINARY(s->len, stream); + for (i = 0; i < s->len; i++) + { + WRITE_STRING(s->stop[i], stream); + } + } + + void + readStopList(FILE *stream, StopList *s) + { + int i; + + READ_BINARY(s->len, stream); + s->stop = (char **) palloc(s->len * sizeof(char *)); + for(i = 0; i < s->len; i++) + { + READ_STRING(s->stop[i], stream); + } + } + #define MAX_NORM 1024 #define MAXNORMLEN 256 *************** *** 252,258 **** tsearch_readline_end(&trst); } - static int FindWord(IspellDict *Conf, const char *word, int affixflag, int flag) { --- 608,613 ---- *************** *** 261,266 **** --- 616,623 ---- *StopHigh, *StopMiddle; uint8 *ptr = (uint8 *) word; + static int xx = 0; + flag &= FF_DICTFLAGMASK; *************** *** 270,276 **** --- 
627,635 ---- StopHigh = node->data + node->length; while (StopLow < StopHigh) { + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + if (StopMiddle->val == *ptr) { if (*(ptr + 1) == '\0' && StopMiddle->isword) *************** *** 321,326 **** --- 680,686 ---- } Affix = Conf->Affix + Conf->naffixes; + Affix->mask = pstrdup(mask); if (strcmp(mask, ".") == 0) { *************** *** 878,884 **** if (!nchar) return NULL; ! rs = (SPNode *) palloc0(SPNHDRSZ + nchar * sizeof(SPNodeData)); rs->length = nchar; data = rs->data; --- 1238,1244 ---- if (!nchar) return NULL; ! rs = allocSPNode(nchar); rs->length = nchar; data = rs->data; *************** *** 987,992 **** --- 1347,1358 ---- Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); Conf->Spell = NULL; + + /* serialize a dictionary */ + if (Conf->stream && Conf->mode == 'w') + { + outSPDict(Conf->stream, Conf); + } } static AffixNode * *************** *** 1000,1012 **** int lownew = low; int naff; AFFIX **aff; ! for (i = low; i < high; i++) if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type)) { nchar++; lastchar = GETCHAR(Conf->Affix + i, level, type); } if (!nchar) return NULL; --- 1366,1380 ---- int lownew = low; int naff; AFFIX **aff; ! 
for (i = low; i < high; i++) + { if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type)) { nchar++; lastchar = GETCHAR(Conf->Affix + i, level, type); } + } if (!nchar) return NULL; *************** *** 1092,1097 **** --- 1460,1466 ---- return; Affix->data->aff = (AFFIX **) palloc(sizeof(AFFIX *) * cnt); + Affix->data->naff = (uint32) cnt; cnt = 0; *************** *** 1130,1135 **** --- 1499,1555 ---- if (Conf->naffixes > 1) qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); + + /* Serialize affix */ + if (Conf->stream && Conf->mode == 'w') + { + outAffix(Conf->stream, Conf); + } + + Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes); + ptr->affix = NULL; + + for (i = 0; i < Conf->naffixes; i++) + { + Affix = &(((AFFIX *) Conf->Affix)[i]); + if (Affix->type == FF_SUFFIX && i < firstsuffix) + firstsuffix = i; + + if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 && + isAffixInUse(Conf, (char) Affix->flag)) + { + if (ptr == Conf->CompoundAffix || + ptr->issuffix != (ptr - 1)->issuffix || + strbncmp((const unsigned char *) (ptr - 1)->affix, + (const unsigned char *) Affix->repl, + (ptr - 1)->len)) + { + /* leave only unique and minimals suffixes */ + ptr->affix = Affix->repl; + ptr->len = Affix->replen; + ptr->issuffix = (Affix->type == FF_SUFFIX) ? 
true : false; + ptr++; + } + } + } + ptr->affix = NULL; + Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1)); + + Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX); + Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX); + mkVoidAffix(Conf, true, firstsuffix); + mkVoidAffix(Conf, false, firstsuffix); + } + + + void + postProcessAffixes(IspellDict *Conf) + { + AFFIX *Affix; + size_t i; + CMPDAffix *ptr; + int firstsuffix = Conf->naffixes; + Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes); ptr->affix = NULL; *************** *** 1172,1177 **** --- 1592,1598 ---- *StopHigh, *StopMiddle; uint8 symbol; + static int xx = 0; if (node->isvoid) { /* search void affixes */ *************** *** 1188,1199 **** { StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); symbol = GETWCHAR(word, wrdlen, *level, type); ! if (StopMiddle->val == symbol) { (*level)++; if (StopMiddle->naff) return StopMiddle; node = StopMiddle->node; break; } --- 1609,1622 ---- { StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); symbol = GETWCHAR(word, wrdlen, *level, type); ! if (StopMiddle->val == symbol) { (*level)++; if (StopMiddle->naff) + { return StopMiddle; + } node = StopMiddle->node; break; } *************** *** 1372,1378 **** while (snode) { int baselen = 0; - /* find possible suffix */ suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); if (!suffix) --- 1795,1800 ---- *************** *** 1402,1408 **** /* prefix success */ int ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ? 
0 : prefix->aff[j]->flag; - if (FindWord(Conf, pnewword, ff, flag)) cur += addToResult(forms, cur, pnewword); } --- 1824,1829 ---- *************** *** 1420,1425 **** --- 1841,1849 ---- pfree(forms); return (NULL); } + + cur = forms; + return (forms); } *** ./src/include/tsearch/dicts/spell.h.orig 2010-08-31 23:46:38.653669628 +0200 --- ./src/include/tsearch/dicts/spell.h 2010-08-31 23:46:47.469669487 +0200 *************** *** 161,166 **** --- 161,168 ---- unsigned char flagval[256]; bool usecompound; + FILE *stream; + char mode; } IspellDict; extern TSLexeme *NINormalizeWord(IspellDict *Conf, char *word); *** ./src/include/tsearch/ts_public.h.orig 2010-01-02 17:58:09.000000000 +0100 --- ./src/include/tsearch/ts_public.h 2010-08-31 23:46:00.185669425 +0200 *************** *** 78,83 **** --- 78,87 ---- char *(*wordop) (const char *)); extern bool searchstoplist(StopList *s, char *key); + extern void outStopList(FILE *stream, StopList *s); + extern void readStopList(FILE *stream, StopList *s); + + /* * Interface with dictionaries */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers