"Kevin Grittner" <kevin.gritt...@wicourts.gov> writes:
>      reserved    = gen-delims / sub-delims
>      gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
>      sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
>                  / "*" / "+" / "," / ";" / "="
>      unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
 
> I think that we should accept all the above characters (reserved and
> unreserved) and the percent character (since it is the escape
> character) as part of a URL.

I've applied the attached patch to make it work that way.

                        regards, tom lane

Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.29
diff -c -r1.29 wparser_def.c
*** src/backend/tsearch/wparser_def.c	26 Apr 2010 17:10:18 -0000	1.29
--- src/backend/tsearch/wparser_def.c	28 Apr 2010 01:57:14 -0000
***************
*** 583,588 ****
--- 583,617 ----
  	return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
  }
  
+ static int
+ p_isurlchar(TParser *prs)
+ {
+ 	char		ch;
+ 
+ 	/* only a one-unit character (charlen == 1) can qualify; non-ASCII is never a URL char */
+ 	if (prs->state->charlen != 1)
+ 		return 0;
+ 	ch = *(prs->str + prs->state->posbyte);
+ 	/* accept only 0x21..0x7E: rejects space, control chars, DEL, and high-bit bytes */
+ 	if (ch <= 0x20 || ch >= 0x7F)
+ 		return 0;
+ 	/* reject printable ASCII that RFC 3986 lists as neither reserved nor unreserved ('%' stays allowed) */
+ 	switch (ch)
+ 	{
+ 		case '"':
+ 		case '<':
+ 		case '>':
+ 		case '\\':
+ 		case '^':
+ 		case '`':
+ 		case '{':
+ 		case '|':
+ 		case '}':
+ 			return 0;
+ 	}
+ 	return 1;
+ }
+ 
  
  /* deliberately suppress unused-function complaints for the above */
  void		_make_compiler_happy(void);
***************
*** 707,715 ****
  	int			res = 0;
  
  	tmpprs->state = newTParserPosition(tmpprs->state);
! 	tmpprs->state->state = TPS_InFileFirst;
  
! 	if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
  	{
  		prs->state->posbyte += tmpprs->lenbytetoken;
  		prs->state->poschar += tmpprs->lenchartoken;
--- 736,744 ----
  	int			res = 0;
  
  	tmpprs->state = newTParserPosition(tmpprs->state);
! 	tmpprs->state->state = TPS_InURLPathFirst;
  
! 	if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
  	{
  		prs->state->posbyte += tmpprs->lenbytetoken;
  		prs->state->poschar += tmpprs->lenchartoken;
***************
*** 1441,1447 ****
  	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
  	{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
  	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
- 	{p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
  	{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
  	{NULL, 0, A_POP, TPS_Null, 0, NULL}
  };
--- 1470,1475 ----
***************
*** 1488,1494 ****
  	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
  	{p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
  	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
- 	{p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
  	{NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
  };
  
--- 1516,1521 ----
***************
*** 1502,1510 ****
  
  static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
  	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
! 	{p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
! 	{p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
! 	{p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
  	{NULL, 0, A_POP, TPS_Null, 0, NULL},
  };
  
--- 1529,1535 ----
  
  static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
  	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
! 	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
  	{NULL, 0, A_POP, TPS_Null, 0, NULL},
  };
  
***************
*** 1514,1522 ****
  
  static const TParserStateActionItem actionTPS_InURLPath[] = {
  	{p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
! 	{p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
! 	{p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
! 	{p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
  	{NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
  };
  
--- 1539,1545 ----
  
  static const TParserStateActionItem actionTPS_InURLPath[] = {
  	{p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
! 	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
  	{NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
  };
  
Index: src/test/regress/expected/tsearch.out
===================================================================
RCS file: /cvsroot/pgsql/src/test/regress/expected/tsearch.out,v
retrieving revision 1.17
diff -c -r1.17 tsearch.out
*** src/test/regress/expected/tsearch.out	22 Nov 2009 05:20:41 -0000	1.17
--- src/test/regress/expected/tsearch.out	28 Apr 2010 01:57:14 -0000
***************
*** 287,294 ****
       6 | 4aew.werc.ewr
      12 |  
      14 | http://
       6 | 5aew.werc.ewr:8100
!     12 | /?  
       1 | ad
      12 | =
       1 | qwe
--- 287,296 ----
       6 | 4aew.werc.ewr
      12 |  
      14 | http://
+      5 | 5aew.werc.ewr:8100/?
       6 | 5aew.werc.ewr:8100
!     18 | /?
!     12 |   
       1 | ad
      12 | =
       1 | qwe
***************
*** 391,404 ****
      12 |  
      12 | <> 
       1 | qwerty
! (131 rows)
  
  SELECT to_tsvector('english', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
  /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
  <i <b> wow  < jqw <> qwerty');
!                                                                                                                                                                                                                                                                                                                                                                                                                       to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                       
! --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
!  '+4.0e-10':26 '-4.2':58,60 '/?ad=qwe&dw':7,10,14,22 '/?ad=qwe&dw=%20%32':25 '/awdf/dwqe/4325':46 '/usr/local/fff':45 '/wqe-324/ewr':49 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':61 '234.435':30 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':54,55,56 '455':31 '4aew.werc.ewr':15 '5.005':32 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100':24 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23 'ad':17 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':37 'dw':19 'efd.r':3 'ewr1':43 'ewri2':44 'gist.c':52 'gist.h':50 'gist.h.c':51 'hjwer':42 'jf':39 'jqw':64 'qwe':2,18,27,28,35 'qwe-wer':34 'qwer':38 'qwerti':65 'qwqwe':29 'readlin':53,57,59 'rewt/ewr':47 'sdjk':40 'teo...@stack.net':33 'wefjn':48 'wer':36 'wow':63 'www.com':4
  (1 row)
  
  SELECT length(to_tsvector('english', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
--- 393,406 ----
      12 |  
      12 | <> 
       1 | qwerty
! (133 rows)
  
  SELECT to_tsvector('english', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
  /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
  <i <b> wow  < jqw <> qwerty');
!                                                                                                                                                                                                                                                                                                                                                                                                                                        to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                                        
! ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
!  '+4.0e-10':28 '-4.2':60,62 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':48 '/usr/local/fff':47 '/wqe-324/ewr':51 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':63 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':56,57,58 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':39 'dw':21 'efd.r':3 'ewr1':45 'ewri2':46 'gist.c':54 'gist.h':52 'gist.h.c':53 'hjwer':44 'jf':41 'jqw':66 'qwe':2,20,29,30,37 'qwe-wer':36 'qwer':40 'qwerti':67 'qwqwe':31 'readlin':55,59,61 'rewt/ewr':49 'sdjk':42 'teo...@stack.net':35 'wefjn':50 'wer':38 'wow':65 'www.com':4
  (1 row)
  
  SELECT length(to_tsvector('english', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
***************
*** 406,412 ****
  <i <b> wow  < jqw <> qwerty'));
   length 
  --------
!      51
  (1 row)
  
  -- ts_debug
--- 408,414 ----
  <i <b> wow  < jqw <> qwerty'));
   length 
  --------
!      53
  (1 row)
  
  -- ts_debug
***************
*** 424,429 ****
--- 426,469 ----
   tag       | XML tag         | </myns:foo-bar_baz.blurfl> | {}             |              | 
  (9 rows)
  
+ -- check parsing of URLs
+ SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
+   alias   |  description  |                 token                  | dictionaries | dictionary |                 lexemes                  
+ ----------+---------------+----------------------------------------+--------------+------------+------------------------------------------
+  protocol | Protocol head | http://                                | {}           |            | 
+  url      | URL           | www.harewoodsolutions.co.uk/press.aspx | {simple}     | simple     | {www.harewoodsolutions.co.uk/press.aspx}
+  host     | Host          | www.harewoodsolutions.co.uk            | {simple}     | simple     | {www.harewoodsolutions.co.uk}
+  url_path | URL path      | /press.aspx                            | {simple}     | simple     | {/press.aspx}
+  tag      | XML tag       | </span>                                | {}           |            | 
+ (5 rows)
+ 
+ SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
+   alias   |  description  |           token            | dictionaries | dictionary |           lexemes            
+ ----------+---------------+----------------------------+--------------+------------+------------------------------
+  protocol | Protocol head | http://                    | {}           |            | 
+  url      | URL           | aew.wer0c.ewr/id?ad=qwe&dw | {simple}     | simple     | {aew.wer0c.ewr/id?ad=qwe&dw}
+  host     | Host          | aew.wer0c.ewr              | {simple}     | simple     | {aew.wer0c.ewr}
+  url_path | URL path      | /id?ad=qwe&dw              | {simple}     | simple     | {/id?ad=qwe&dw}
+  tag      | XML tag       | <span>                     | {}           |            | 
+ (5 rows)
+ 
+ SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
+   alias   |  description  |        token         | dictionaries | dictionary |        lexemes         
+ ----------+---------------+----------------------+--------------+------------+------------------------
+  protocol | Protocol head | http://              | {}           |            | 
+  url      | URL           | 5aew.werc.ewr:8100/? | {simple}     | simple     | {5aew.werc.ewr:8100/?}
+  host     | Host          | 5aew.werc.ewr:8100   | {simple}     | simple     | {5aew.werc.ewr:8100}
+  url_path | URL path      | /?                   | {simple}     | simple     | {/?}
+ (4 rows)
+ 
+ SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
+   alias   | description |         token          | dictionaries | dictionary |         lexemes          
+ ----------+-------------+------------------------+--------------+------------+--------------------------
+  url      | URL         | 5aew.werc.ewr:8100/?xx | {simple}     | simple     | {5aew.werc.ewr:8100/?xx}
+  host     | Host        | 5aew.werc.ewr:8100     | {simple}     | simple     | {5aew.werc.ewr:8100}
+  url_path | URL path    | /?xx                   | {simple}     | simple     | {/?xx}
+ (3 rows)
+ 
  -- to_tsquery
  SELECT to_tsquery('english', 'qwe & sKies ');
    to_tsquery   
Index: src/test/regress/sql/tsearch.sql
===================================================================
RCS file: /cvsroot/pgsql/src/test/regress/sql/tsearch.sql,v
retrieving revision 1.11
diff -c -r1.11 tsearch.sql
*** src/test/regress/sql/tsearch.sql	19 May 2009 02:48:26 -0000	1.11
--- src/test/regress/sql/tsearch.sql	28 Apr 2010 01:57:14 -0000
***************
*** 105,110 ****
--- 105,116 ----
  
  SELECT * from ts_debug('english', '<myns:foo-bar_baz.blurfl>abc&nm1;def&#xa9;ghi&#245;jkl</myns:foo-bar_baz.blurfl>');
  
+ -- check parsing of URLs
+ SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
+ SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
+ SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
+ SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
+ 
  -- to_tsquery
  
  SELECT to_tsquery('english', 'qwe & sKies ');
-- 
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

Reply via email to