Hi, While playing around/evaluating tsearch I notices that to_tsvector is obscenely slow for some files. After some profiling I found that this is due using a seperate TSParser in p_ishost/p_isURLPath in wparser_def.c. If a multibyte encoding is in use TParserInit copies the whole remaining input and converts it to wchar_t or pg_wchar - for every email or protocol prefixed url in the the document. Which obviously is bad.
I solved the issue by having a seperate TParserCopyInit/TParserCopyClose which reuses the the already converted strings of the original TParser - only at different offsets. Another approach would be to get rid of the separate parser invocations - requiring a bunch of additional states. This seemed more complex to me, so I wanted to get some feedback first. Without patch: andres=# SELECT to_tsvector('english', document) FROM document WHERE filename = '/usr/share/doc/libdrm-nouveau1/changelog'; ───────────────────────────────────────────────────────────────────────────────────────────────────── ... (1 row) Time: 5835.676 ms With patch: andres=# SELECT to_tsvector('english', document) FROM document WHERE filename = '/usr/share/doc/libdrm-nouveau1/changelog'; ───────────────────────────────────────────────────────────────────────────────────────────────────── ... (1 row) Time: 395.341 ms Ill cleanup the patch if it seems like a sensible solution... Is this backpatch-worthy? Andres PS: I let the additional define in for the moment so that its easier to see the performance differences.
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 301c1eb..14757d9 100644 *** a/src/backend/tsearch/wparser_def.c --- b/src/backend/tsearch/wparser_def.c *************** *** 24,29 **** --- 24,30 ---- /* Define me to enable tracing of parser behavior */ /* #define WPARSER_TRACE */ + #define WPARSER_REUSE /* Output token categories */ *************** TParserInit(char *str, int len) *** 328,333 **** --- 329,358 ---- return prs; } + static TParser * + TParserCopyInit(const TParser const* orig, size_t offset, int len) + { + TParser *prs = (TParser *) palloc0(sizeof(TParser)); + + prs->charmaxlen = orig->charmaxlen; + prs->usewide = orig->usewide; + prs->lenstr = len; + + prs->str = orig->str + offset; + prs->pgwstr = orig->pgwstr + offset; + prs->wstr = orig->wstr + offset; + + + prs->state = newTParserPosition(NULL); + prs->state->state = TPS_Base; + + #ifdef WPARSER_TRACE + fprintf(stderr, "parsing \"%.*s\"\n", len, orig->str + offset); + #endif + + return prs; + } + static void TParserClose(TParser *prs) { *************** TParserClose(TParser *prs) *** 345,351 **** --- 370,388 ---- if (prs->pgwstr) pfree(prs->pgwstr); #endif + pfree(prs); + } + static void + TParserCopyClose(TParser *prs) + { + while (prs->state) + { + TParserPosition *ptr = prs->state->prev; + + pfree(prs->state); + prs->state = ptr; + } pfree(prs); } *************** p_isignore(TParser *prs) *** 617,623 **** static int p_ishost(TParser *prs) { ! TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte); int res = 0; tmpprs->wanthost = true; --- 654,664 ---- static int p_ishost(TParser *prs) { ! #ifdef WPARSER_REUSE ! TParser *tmpprs = TParserCopyInit(prs, prs->state->posbyte, prs->lenstr - prs->state->posbyte); ! #else ! TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte); ! #endif int res = 0; tmpprs->wanthost = true; *************** p_ishost(TParser *prs) *** 631,637 **** --- 672,682 ---- prs->state->charlen = tmpprs->state->charlen; res = 1; } + #ifdef WPARSER_REUSE + TParserCopyClose(tmpprs); + #else TParserClose(tmpprs); + #endif return res; } *************** p_ishost(TParser *prs) *** 639,645 **** static int p_isURLPath(TParser *prs) { ! TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte); int res = 0; tmpprs->state = newTParserPosition(tmpprs->state); --- 684,694 ---- static int p_isURLPath(TParser *prs) { ! #ifdef WPARSER_REUSE ! TParser *tmpprs = TParserCopyInit(prs, prs->state->posbyte, prs->lenstr - prs->state->posbyte); ! #else ! TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte); ! #endif int res = 0; tmpprs->state = newTParserPosition(tmpprs->state); *************** p_isURLPath(TParser *prs) *** 654,660 **** --- 703,713 ---- prs->state->charlen = tmpprs->state->charlen; res = 1; } + #ifdef WPARSER_REUSE + TParserCopyClose(tmpprs); + #else TParserClose(tmpprs); + #endif return res; }
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers