I have tested in a locale-enabled environment and found a bug. Included
is the new version of the patches.

Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

> Hi,
> 
> Here are patches against tsearch2 with CVS head.  Currently tsearch2
> does not work with multibyte encoding which uses C locale. These
> patches are intended to solve the problem by using PostgreSQL in-house
> multibyte function instead of mbstowcs which does not work with C
> locale. Also iswalpha etc. will not be called in case of C locale
> since they are not working with it. Tested with the EUC_JP encoding
> (should work with any multibyte encoding). Existing single-byte
> encodings should not be broken by the patches, though I did not test them.
> --
> Tatsuo Ishii
> SRA OSS, Inc. Japan
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c 20 Nov 2006 14:03:30 -0000      1.7
--- ts_locale.c 4 Jan 2007 12:16:00 -0000
***************
*** 63,68 ****
--- 63,101 ----
  
        return mbstowcs(to, from, len);
  }
+ 
+ #else /* WIN32 */
+ 
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+       wchar_t *result;
+       size_t n;
+ 
+       if (to == NULL)
+               return 0;
+ 
+       if (lc_ctype_is_c())
+       {
+               /* allocate necessary memory for "to", including the NUL terminator 
*/
+               result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+ 
+               /* do the conversion */
+               n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+               if (n > 0)
+               {
+                       /* store the result */
+                       if (n > len)
+                               n = len;
+                       memcpy(to, result, n*sizeof(wchar_t));
+                       pfree(result);
+                       *(to + n) = '\0';
+               }
+               return n;
+       }
+       return mbstowcs(to, from, len);
+ }
+ 
  #endif   /* WIN32 */
  
  int
***************
*** 70,75 ****
--- 103,113 ----
  {
        wchar_t         character;
  
+       if (lc_ctype_is_c())
+       {
+               return isalpha(TOUCHAR(ptr));
+       }
+ 
        char2wchar(&character, ptr, 1);
  
        return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ----
  {
        wchar_t         character;
  
+       if (lc_ctype_is_c())
+       {
+               return isprint(TOUCHAR(ptr));
+       }
+ 
        char2wchar(&character, ptr, 1);
  
        return iswprint((wint_t) character);
***************
*** 126,132 ****
                if ( wlen < 0 )
                        ereport(ERROR,
                                        
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                                        errmsg("transalation failed from 
server encoding to wchar_t")));
  
                Assert(wlen<=len);
                wstr[wlen] = 0;
--- 169,175 ----
                if ( wlen < 0 )
                        ereport(ERROR,
                                        
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                                        errmsg("translation failed from server 
encoding to wchar_t")));
  
                Assert(wlen<=len);
                wstr[wlen] = 0;
***************
*** 152,158 ****
                if ( wlen < 0 )
                        ereport(ERROR,
                                        
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                                        errmsg("transalation failed from 
wchar_t to server encoding %d", errno)));
                Assert(wlen<=len);
                out[wlen]='\0';
        }
--- 195,201 ----
                if ( wlen < 0 )
                        ereport(ERROR,
                                        
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                                        errmsg("translation failed from 
wchar_t to server encoding %d", errno)));
                Assert(wlen<=len);
                out[wlen]='\0';
        }
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h 4 Oct 2006 00:29:47 -0000       1.7
--- ts_locale.h 4 Jan 2007 12:16:00 -0000
***************
*** 38,45 ****
  #else                                                 /* WIN32 */
  
  /* correct mbstowcs */
- #define char2wchar mbstowcs
  #define wchar2char wcstombs
  #endif   /* WIN32 */
  
  #define t_isdigit(x)  ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 38,46 ----
  #else                                                 /* WIN32 */
  
  /* correct mbstowcs */
  #define wchar2char wcstombs
+ size_t                char2wchar(wchar_t *to, const char *from, size_t len);
+ 
  #endif   /* WIN32 */
  
  #define t_isdigit(x)  ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----
   * t_iseq() should be called only for ASCII symbols
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) 
) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
  
  #define COPYCHAR(d,s) do {                            \
        int lll = pg_mblen( s );                        \
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c 4 Oct 2006 00:29:47 -0000       1.11
--- wordparser/parser.c 4 Jan 2007 12:16:01 -0000
***************
*** 44,52 ****
         * Some operating systems fail with multi-byte encodings and a C locale.
         * Also, for a C locale there is no need to process as multibyte. From
         * backend/utils/adt/oracle_compat.c Teodor
         */
  
!       if (prs->charmaxlen > 1 && !lc_ctype_is_c())
        {
                prs->usewide = true;
                prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
--- 44,54 ----
         * Some operating systems fail with multi-byte encodings and a C locale.
         * Also, for a C locale there is no need to process as multibyte. From
         * backend/utils/adt/oracle_compat.c Teodor
+        *
+        * This is a wrong assumption: even if the locale is C, multibyte processing is necessary.
         */
  
!       if (prs->charmaxlen > 1)
        {
                prs->usewide = true;
                prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
***************
*** 92,98 ****
  static int                                                                    
                \
  p_is##type(TParser *prs) {                                                    
                \
        Assert( prs->state );                                                   
                \
!       return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + 
prs->state->poschar ) ) : \
                is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) 
);               \
  }     \
                                                                                
                \
--- 94,102 ----
  static int                                                                    
                \
  p_is##type(TParser *prs) {                                                    
                \
        Assert( prs->state );                                                   
                \
!       return ( ( prs->usewide ) ? \
!                        (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + 
prs->state->poschar)): \
!                         isw##type( (wint_t)*( prs->wstr + 
prs->state->poschar))): \
                is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) 
);               \
  }     \
                                                                                
                \
***************
*** 134,141 ****
  }
  #endif   /* TS_USE_WIDE */
  
! p_iswhat(alnum)
! p_iswhat(alpha)
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
--- 138,197 ----
  }
  #endif   /* TS_USE_WIDE */
  
! static int p_isalnum(TParser *prs) {
!       Assert( prs->state );
! 
!       if (prs->usewide)
!       {
!               unsigned int c;
! 
!               c = *(prs->wstr + prs->state->poschar);
! 
!               if (lc_ctype_is_c())
!               {
!                       if (c > 0x7f)
!                               return 1;
!                       return isalnum(0xff & c);
!               }
!               else
!                       return iswalnum( (wint_t)*( prs->wstr + 
prs->state->poschar));
!       }
!       else
!               return isalnum( (unsigned char)*( prs->str + 
prs->state->posbyte ));
! }
! 
! static int    p_isnotalnum(TParser *prs)
! {
!       return !p_isalnum(prs);
! }
! 
! static int p_isalpha(TParser *prs) {
!       Assert( prs->state );
! 
!       if (prs->usewide)
!       {
!               unsigned int c;
! 
!               c = *(prs->wstr + prs->state->poschar);
! 
!               if (lc_ctype_is_c())
!               {
!                       if (c > 0x7f)
!                               return 1;
!                       return isalpha(0xff & c);
!               }
!               else
!                       return iswalpha( (wint_t)*( prs->wstr + 
prs->state->poschar));
!       }
!       else
!               return isalpha( (unsigned char)*( prs->str + 
prs->state->posbyte ));
! }
! 
! static int    p_isnotalpha(TParser *prs)
! {
!       return !p_isalpha(prs);
! }
! 
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
---------------------------(end of broadcast)---------------------------
TIP 9: In versions below 8.0, the planner will ignore your desire to
       choose an index scan if your joining column's datatypes do not
       match

Reply via email to