Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
Ok, I see.
Please test the attached patch - if it is good, I'll commit it on Monday to the HEAD
and 8.2 branches.
PS. Magnus, may I ask you to test under Windows? Thank you.
--
Teodor Sigaev E-mail: [EMAIL PROTECTED]
WWW: http://www.sigaev.ru/
diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
*** ../tsearch2.orig/ts_locale.c Fri Jan 12 10:53:11 2007
--- ./ts_locale.c Fri Jan 12 18:10:27 2007
***************
*** 12,24 ****
size_t
wchar2char(char *to, const wchar_t *from, size_t len)
{
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- if (len == 0)
- return 0;
-
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
NULL, NULL);
--- 12,24 ----
size_t
wchar2char(char *to, const wchar_t *from, size_t len)
{
+ if (len == 0)
+ return 0;
+
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
NULL, NULL);
***************
*** 34,50 ****
return wcstombs(to, from, len);
}
size_t
char2wchar(wchar_t *to, const char *from, size_t len)
{
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- if (len == 0)
- return 0;
-
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r)
--- 34,52 ----
return wcstombs(to, from, len);
}
+ #endif /* WIN32 */
size_t
char2wchar(wchar_t *to, const char *from, size_t len)
{
+ if (len == 0)
+ return 0;
+
+ #ifdef WIN32
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r)
***************
*** 60,88 ****
return r;
}
return mbstowcs(to, from, len);
}
- #endif /* WIN32 */
int
_t_isalpha(const char *ptr)
{
! wchar_t character;
! char2wchar(&character, ptr, 1);
! return iswalpha((wint_t) character);
}
int
_t_isprint(const char *ptr)
{
! wchar_t character;
! char2wchar(&character, ptr, 1);
! return iswprint((wint_t) character);
}
#endif /* TS_USE_WIDE */
--- 62,105 ----
return r;
}
+ else
+ #endif /* WIN32 */
+ if ( lc_ctype_is_c() )
+ {
+ /*
+ * pg_mb2wchar_with_len always adds trailing '\0', so
+ * 'to' should be allocated with sufficient space
+ */
+ return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ }
return mbstowcs(to, from, len);
}
int
_t_isalpha(const char *ptr)
{
! wchar_t character[2];
!
! if (lc_ctype_is_c())
! return isalpha(TOUCHAR(ptr));
! char2wchar(character, ptr, 1);
! return iswalpha((wint_t) *character);
}
int
_t_isprint(const char *ptr)
{
! wchar_t character[2];
!
! if (lc_ctype_is_c())
! return isprint(TOUCHAR(ptr));
! char2wchar(character, ptr, 1);
! return iswprint((wint_t) *character);
}
#endif /* TS_USE_WIDE */
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from
server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
--- 143,149 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from server
encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from
wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
--- 169,175 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from
wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
*** ../tsearch2.orig/ts_locale.h Fri Jan 12 10:53:11 2007
--- ./ts_locale.h Fri Jan 12 18:10:19 2007
***************
*** 30,45 ****
#define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE
#ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len);
! size_t char2wchar(wchar_t *to, const char *from, size_t len);
#else /* WIN32 */
! /* correct mbstowcs */
! #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 30,46 ----
#define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE
+ size_t char2wchar(wchar_t *to, const char *from, size_t len);
#ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len);
!
#else /* WIN32 */
! /* correct wcstombs */
#define wchar2char wcstombs
+
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 55,64 ****
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c))
) : false )
! #define COPYCHAR(d,s) do { \
! int lll = pg_mblen( s ); \
! \
! while( lll-- ) \
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
--- 56,65 ----
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c))
) : false )
! #define COPYCHAR(d,s) do { \
! int lll = pg_mblen( s ); \
!
\
! while( lll-- )
\
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
diff -c -r -N ../tsearch2.orig/tsearch2.patch ./tsearch2.patch
*** ../tsearch2.orig/tsearch2.patch Thu Jan 1 03:00:00 1970
--- ./tsearch2.patch Fri Jan 12 18:12:30 2007
***************
*** 0 ****
--- 1,243 ----
+ diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
+ *** ../tsearch2.orig/ts_locale.c Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.c Fri Jan 12 18:10:27 2007
+ ***************
+ *** 12,24 ****
+ size_t
+ wchar2char(char *to, const wchar_t *from, size_t len)
+ {
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ - if (len == 0)
+ - return 0;
+ -
+ r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+ NULL, NULL);
+
+ --- 12,24 ----
+ size_t
+ wchar2char(char *to, const wchar_t *from, size_t len)
+ {
+ + if (len == 0)
+ + return 0;
+ +
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+ NULL, NULL);
+
+ ***************
+ *** 34,50 ****
+
+ return wcstombs(to, from, len);
+ }
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ - if (len == 0)
+ - return 0;
+ -
+ r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+ if (!r)
+ --- 34,52 ----
+
+ return wcstombs(to, from, len);
+ }
+ + #endif /* WIN32 */
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ + if (len == 0)
+ + return 0;
+ +
+ + #ifdef WIN32
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+ if (!r)
+ ***************
+ *** 60,88 ****
+
+ return r;
+ }
+
+ return mbstowcs(to, from, len);
+ }
+ - #endif /* WIN32 */
+
+ int
+ _t_isalpha(const char *ptr)
+ {
+ ! wchar_t character;
+
+ ! char2wchar(&character, ptr, 1);
+
+ ! return iswalpha((wint_t) character);
+ }
+
+ int
+ _t_isprint(const char *ptr)
+ {
+ ! wchar_t character;
+
+ ! char2wchar(&character, ptr, 1);
+
+ ! return iswprint((wint_t) character);
+ }
+ #endif /* TS_USE_WIDE */
+
+ --- 62,105 ----
+
+ return r;
+ }
+ + else
+ + #endif /* WIN32 */
+ + if ( lc_ctype_is_c() )
+ + {
+ + /*
+ + * pg_mb2wchar_with_len always adds trailing '\0', so
+ + * 'to' should be allocated with sufficient space
+ + */
+ + return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ + }
+
+ return mbstowcs(to, from, len);
+ }
+
+ int
+ _t_isalpha(const char *ptr)
+ {
+ ! wchar_t character[2];
+ !
+ ! if (lc_ctype_is_c())
+ ! return isalpha(TOUCHAR(ptr));
+
+ ! char2wchar(character, ptr, 1);
+
+ ! return iswalpha((wint_t) *character);
+ }
+
+ int
+ _t_isprint(const char *ptr)
+ {
+ ! wchar_t character[2];
+ !
+ ! if (lc_ctype_is_c())
+ ! return isprint(TOUCHAR(ptr));
+
+ ! char2wchar(character, ptr, 1);
+
+ ! return iswprint((wint_t) *character);
+ }
+ #endif /* TS_USE_WIDE */
+
+ ***************
+ *** 126,132 ****
+ if ( wlen < 0 )
+ ereport(ERROR,
+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("transalation failed from
server encoding to wchar_t")));
+
+ Assert(wlen<=len);
+ wstr[wlen] = 0;
+ --- 143,149 ----
+ if ( wlen < 0 )
+ ereport(ERROR,
+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("translation failed from server
encoding to wchar_t")));
+
+ Assert(wlen<=len);
+ wstr[wlen] = 0;
+ ***************
+ *** 152,158 ****
+ if ( wlen < 0 )
+ ereport(ERROR,
+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("transalation failed from
wchar_t to server encoding %d", errno)));
+ Assert(wlen<=len);
+ out[wlen]='\0';
+ }
+ --- 169,175 ----
+ if ( wlen < 0 )
+ ereport(ERROR,
+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("translation failed from
wchar_t to server encoding %d", errno)));
+ Assert(wlen<=len);
+ out[wlen]='\0';
+ }
+ diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
+ *** ../tsearch2.orig/ts_locale.h Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.h Fri Jan 12 18:10:19 2007
+ ***************
+ *** 30,45 ****
+ #define TOUCHAR(x) (*((unsigned char*)(x)))
+
+ #ifdef TS_USE_WIDE
+
+ #ifdef WIN32
+
+ size_t wchar2char(char *to, const wchar_t *from, size_t len);
+ ! size_t char2wchar(wchar_t *to, const char *from, size_t len);
+ #else /* WIN32 */
+
+ ! /* correct mbstowcs */
+ ! #define char2wchar mbstowcs
+ #define wchar2char wcstombs
+ #endif /* WIN32 */
+
+ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ --- 30,46 ----
+ #define TOUCHAR(x) (*((unsigned char*)(x)))
+
+ #ifdef TS_USE_WIDE
+ + size_t char2wchar(wchar_t *to, const char *from, size_t len);
+
+ #ifdef WIN32
+
+ size_t wchar2char(char *to, const wchar_t *from, size_t len);
+ !
+ #else /* WIN32 */
+
+ ! /* correct wcstombs */
+ #define wchar2char wcstombs
+ +
+ #endif /* WIN32 */
+
+ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ ***************
+ *** 55,64 ****
+ */
+ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned
char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s) do { \
+ ! int lll = pg_mblen( s ); \
+ ! \
+ ! while( lll-- ) \
+ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
+ } while(0)
+
+ --- 56,65 ----
+ */
+ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned
char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s) do { \
+ ! int lll = pg_mblen( s ); \
+ !
\
+ ! while( lll-- )
\
+ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
+ } while(0)
+
diff -c -r -N ../tsearch2.orig/wordparser/parser.c ./wordparser/parser.c
*** ../tsearch2.orig/wordparser/parser.c Fri Jan 12 10:53:11 2007
--- ./wordparser/parser.c Fri Jan 12 18:10:38 2007
***************
*** 40,55 ****
#ifdef TS_USE_WIDE
/*
! * Use wide char code only when max encoding length > 1 and ctype != C.
! * Some operating systems fail with multi-byte encodings and a C locale.
! * Also, for a C locale there is no need to process as multibyte. From
! * backend/utils/adt/oracle_compat.c Teodor
*/
! if (prs->charmaxlen > 1 && !lc_ctype_is_c())
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
--- 40,52 ----
#ifdef TS_USE_WIDE
/*
! * Use wide char code only when max encoding length > 1.
*/
! if (prs->charmaxlen > 1)
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) *
(prs->lenstr+1));
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
***************
*** 83,107 ****
/*
* defining support function, equvalent is* macroses, but
! * working with any possible encodings and locales
*/
#ifdef TS_USE_WIDE
! #define p_iswhat(type)
\
! static int
\
! p_is##type(TParser *prs) {
\
! Assert( prs->state );
\
! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr +
prs->state->poschar ) ) : \
! is##type( (unsigned char)*( prs->str + prs->state->posbyte ) )
); \
! } \
!
\
! static int
\
! p_isnot##type(TParser *prs) {
\
! return !p_is##type(prs);
\
}
/* p_iseq should be used only for ascii symbols */
--- 80,178 ----
/*
* defining support function, equvalent is* macroses, but
! * working with any possible encodings and locales. Note,
! * that with multibyte encoding and C-locale isw* function may fail
! * or give wrong result. Note 2: multibyte encoding and C-locale
! * often are used for Asian languages.
*/
#ifdef TS_USE_WIDE
! #define p_iswhat(type)
\
! static int
\
! p_is##type(TParser *prs) {
\
! Assert( prs->state );
\
! if ( prs->usewide )
\
! {
\
! if ( lc_ctype_is_c() )
\
! return is##type( 0xff & *( prs->wstr +
prs->state->poschar) ); \
!
\
! return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar )
); \
! }
\
!
\
! return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) );
\
! }
\
!
\
! static int
\
! p_isnot##type(TParser *prs) {
\
! return !p_is##type(prs);
\
}
+ static int
+ p_isalnum(TParser *prs)
+ {
+ Assert( prs->state );
+
+ if (prs->usewide)
+ {
+ if (lc_ctype_is_c())
+ {
+ unsigned int c = *(unsigned int*)(prs->wstr +
prs->state->poschar);
+
+ /*
+ * any non-ascii symbol with multibyte encoding
+ * with C-locale is an alpha character
+ */
+ if ( c > 0x7f )
+ return 1;
+
+ return isalnum(0xff & c);
+ }
+
+ return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+ }
+
+ return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+ static int
+ p_isnotalnum(TParser *prs)
+ {
+ return !p_isalnum(prs);
+ }
+
+ static int
+ p_isalpha(TParser *prs)
+ {
+ Assert( prs->state );
+
+ if (prs->usewide)
+ {
+ if (lc_ctype_is_c())
+ {
+ unsigned int c = *(prs->wstr + prs->state->poschar);
+
+ /*
+ * any non-ascii symbol with multibyte encoding
+ * with C-locale is an alpha character
+ */
+ if ( c > 0x7f )
+ return 1;
+
+ return isalpha(0xff & c);
+ }
+
+ return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+ }
+
+ return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+
+ static int
+ p_isnotalpha(TParser *prs)
+ {
+ return !p_isalpha(prs);
+ }
/* p_iseq should be used only for ascii symbols */
***************
*** 111,128 ****
Assert(prs->state);
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte)
== c)) ? 1 : 0;
}
#else /* TS_USE_WIDE */
! #define p_iswhat(type)
\
! static int
\
! p_is##type(TParser *prs) {
\
! Assert( prs->state );
\
! return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );
\
! } \
!
\
! static int
\
! p_isnot##type(TParser *prs) {
\
! return !p_is##type(prs);
\
}
--- 182,200 ----
Assert(prs->state);
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte)
== c)) ? 1 : 0;
}
+
#else /* TS_USE_WIDE */
! #define p_iswhat(type)
\
! static int
\
! p_is##type(TParser *prs) {
\
! Assert( prs->state );
\
! return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );
\
! }
\
!
\
! static int
\
! p_isnot##type(TParser *prs) {
\
! return !p_is##type(prs);
\
}
***************
*** 132,141 ****
Assert(prs->state);
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}
- #endif /* TS_USE_WIDE */
p_iswhat(alnum)
p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
--- 204,215 ----
Assert(prs->state);
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}
p_iswhat(alnum)
p_iswhat(alpha)
+
+ #endif /* TS_USE_WIDE */
+
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
---------------------------(end of broadcast)---------------------------
TIP 1: if posting/reading through Usenet, please send an appropriate
subscribe-nomail command to [EMAIL PROTECTED] so that your
message can get through to the mailing list cleanly