Hi,

While playing around/evaluating tsearch I notices that to_tsvector is 
obscenely slow for some files. After some profiling I found that this is due 
using a seperate TSParser in p_ishost/p_isURLPath in wparser_def.c.
If a multibyte encoding is in use TParserInit copies the whole remaining input 
and converts it to wchar_t or pg_wchar - for every email or protocol prefixed 
url in the the document. Which obviously is bad.

I solved the issue by having a seperate TParserCopyInit/TParserCopyClose which 
reuses the the already converted strings of the original TParser - only at 
different offsets.

Another approach would be to get rid of the separate parser invocations - 
requiring a bunch of additional states. This seemed more complex to me, so I 
wanted to get some feedback first.

Without patch:
andres=# SELECT to_tsvector('english', document) FROM document WHERE filename = 
'/usr/share/doc/libdrm-nouveau1/changelog';
                                                                                
                      
 
─────────────────────────────────────────────────────────────────────────────────────────────────────
...
 (1 row)
 
Time: 5835.676 ms

With patch:
andres=# SELECT to_tsvector('english', document) FROM document WHERE filename = 
'/usr/share/doc/libdrm-nouveau1/changelog';
                                                                                
                      
 
─────────────────────────────────────────────────────────────────────────────────────────────────────
...
 (1 row)
 
Time: 395.341 ms

Ill cleanup the patch if it seems like a sensible solution...

Is this backpatch-worthy?

Andres


PS: I let the additional define in for the moment so that its easier to see the 
performance differences.
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index 301c1eb..14757d9 100644
*** a/src/backend/tsearch/wparser_def.c
--- b/src/backend/tsearch/wparser_def.c
***************
*** 24,29 ****
--- 24,30 ----
  
  /* Define me to enable tracing of parser behavior */
  /* #define WPARSER_TRACE */
+ #define WPARSER_REUSE
  
  
  /* Output token categories */
*************** TParserInit(char *str, int len)
*** 328,333 ****
--- 329,358 ----
  	return prs;
  }
  
+ static TParser *
+ TParserCopyInit(const TParser const* orig, size_t offset, int len)
+ {
+ 	TParser    *prs = (TParser *) palloc0(sizeof(TParser));
+ 
+ 	prs->charmaxlen = orig->charmaxlen;
+ 	prs->usewide = orig->usewide;
+ 	prs->lenstr = len;
+ 
+ 	prs->str = orig->str + offset;
+ 	prs->pgwstr = orig->pgwstr + offset;
+ 	prs->wstr = orig->wstr + offset;
+ 
+ 
+ 	prs->state = newTParserPosition(NULL);
+ 	prs->state->state = TPS_Base;
+ 
+ #ifdef WPARSER_TRACE
+ 	fprintf(stderr, "parsing \"%.*s\"\n", len, orig->str + offset);
+ #endif
+ 
+ 	return prs;
+ }
+ 
  static void
  TParserClose(TParser *prs)
  {
*************** TParserClose(TParser *prs)
*** 345,351 ****
--- 370,388 ----
  	if (prs->pgwstr)
  		pfree(prs->pgwstr);
  #endif
+ 	pfree(prs);
+ }
  
+ static void
+ TParserCopyClose(TParser *prs)
+ {
+ 	while (prs->state)
+ 	{
+ 		TParserPosition *ptr = prs->state->prev;
+ 
+ 		pfree(prs->state);
+ 		prs->state = ptr;
+ 	}
  	pfree(prs);
  }
  
*************** p_isignore(TParser *prs)
*** 617,623 ****
  static int
  p_ishost(TParser *prs)
  {
! 	TParser    *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
  	int			res = 0;
  
  	tmpprs->wanthost = true;
--- 654,664 ----
  static int
  p_ishost(TParser *prs)
  {
! #ifdef WPARSER_REUSE
! 	TParser *tmpprs = TParserCopyInit(prs, prs->state->posbyte, prs->lenstr - prs->state->posbyte);
! #else
! 	TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
! #endif
  	int			res = 0;
  
  	tmpprs->wanthost = true;
*************** p_ishost(TParser *prs)
*** 631,637 ****
--- 672,682 ----
  		prs->state->charlen = tmpprs->state->charlen;
  		res = 1;
  	}
+ #ifdef WPARSER_REUSE
+ 	TParserCopyClose(tmpprs);
+ #else
  	TParserClose(tmpprs);
+ #endif
  
  	return res;
  }
*************** p_ishost(TParser *prs)
*** 639,645 ****
  static int
  p_isURLPath(TParser *prs)
  {
! 	TParser    *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
  	int			res = 0;
  
  	tmpprs->state = newTParserPosition(tmpprs->state);
--- 684,694 ----
  static int
  p_isURLPath(TParser *prs)
  {
! #ifdef WPARSER_REUSE
! 	TParser *tmpprs = TParserCopyInit(prs, prs->state->posbyte, prs->lenstr - prs->state->posbyte);
! #else
! 	TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
! #endif
  	int			res = 0;
  
  	tmpprs->state = newTParserPosition(tmpprs->state);
*************** p_isURLPath(TParser *prs)
*** 654,660 ****
--- 703,713 ----
  		prs->state->charlen = tmpprs->state->charlen;
  		res = 1;
  	}
+ #ifdef WPARSER_REUSE
+ 	TParserCopyClose(tmpprs);
+ #else
  	TParserClose(tmpprs);
+ #endif
  
  	return res;
  }
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to