I wrote:
> Nonetheless, the code is certainly giving wrong answers for 4-byte
> characters.  Will go fix...

I've applied the attached patch for 8.1, and related patches in all
supported branches.

                        regards, tom lane


Index: wchar.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
retrieving revision 1.47.2.4
diff -c -r1.47.2.4 wchar.c
*** wchar.c     22 Aug 2006 12:11:38 -0000      1.47.2.4
--- wchar.c     24 Jan 2007 16:16:27 -0000
***************
*** 345,362 ****
  }
  
  /*
!  * convert UTF8 string to pg_wchar (UCS-2)
!  * caller should allocate enough space for "to"
   * len: length of from.
   * "from" not necessarily null terminated.
   */
  static int
  pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  {
-       unsigned char c1,
-                               c2,
-                               c3;
        int                     cnt = 0;
  
        while (len > 0 && *from)
        {
--- 345,363 ----
  }
  
  /*
!  * convert UTF8 string to pg_wchar (UCS-4)
!  * caller must allocate enough space for "to", including a trailing zero!
   * len: length of from.
   * "from" not necessarily null terminated.
   */
  static int
  pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  {
        int                     cnt = 0;
+       uint32          c1,
+                               c2,
+                               c3,
+                               c4;
  
        while (len > 0 && *from)
        {
***************
*** 365,390 ****
                        *to = *from++;
                        len--;
                }
!               else if ((*from & 0xe0) == 0xc0 && len >= 2)
                {
                        c1 = *from++ & 0x1f;
                        c2 = *from++ & 0x3f;
!                       *to = c1 << 6;
!                       *to |= c2;
                        len -= 2;
                }
!               else if ((*from & 0xe0) == 0xe0 && len >= 3)
                {
                        c1 = *from++ & 0x0f;
                        c2 = *from++ & 0x3f;
                        c3 = *from++ & 0x3f;
!                       *to = c1 << 12;
!                       *to |= c2 << 6;
!                       *to |= c3;
                        len -= 3;
                }
                else
                {
                        *to = *from++;
                        len--;
                }
--- 366,404 ----
                        *to = *from++;
                        len--;
                }
!               else if ((*from & 0xe0) == 0xc0)
                {
+                       if (len < 2)
+                               break;                  /* drop trailing incomplete char */
                        c1 = *from++ & 0x1f;
                        c2 = *from++ & 0x3f;
!                       *to = (c1 << 6) | c2;
                        len -= 2;
                }
!               else if ((*from & 0xf0) == 0xe0)
                {
+                       if (len < 3)
+                               break;                  /* drop trailing incomplete char */
                        c1 = *from++ & 0x0f;
                        c2 = *from++ & 0x3f;
                        c3 = *from++ & 0x3f;
!                       *to = (c1 << 12) | (c2 << 6) | c3;
                        len -= 3;
                }
+               else if ((*from & 0xf8) == 0xf0)
+               {
+                       if (len < 4)
+                               break;                  /* drop trailing incomplete char */
+                       c1 = *from++ & 0x07;
+                       c2 = *from++ & 0x3f;
+                       c3 = *from++ & 0x3f;
+                       c4 = *from++ & 0x3f;
+                       *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
+                       len -= 4;
+               }
                else
                {
+                       /* treat a bogus char as length 1; not ours to raise error */
                        *to = *from++;
                        len--;
                }
***************
*** 396,407 ****
  }
  
  /*
!  * returns the byte length of a UTF8 character pointed to by s
   */
  int
  pg_utf_mblen(const unsigned char *s)
  {
!       int                     len = 1;
  
        if ((*s & 0x80) == 0)
                len = 1;
--- 410,429 ----
  }
  
  /*
!  * Return the byte length of a UTF8 character pointed to by s
!  *
!  * Note: in the current implementation we do not support UTF8 sequences
!  * of more than 4 bytes; hence do NOT return a value larger than 4.
!  * We return "1" for any leading byte that is either flat-out illegal or
!  * indicates a length larger than we support.
!  *
!  * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
!  * other places would need to be fixed to change this.
   */
  int
  pg_utf_mblen(const unsigned char *s)
  {
!       int                     len;
  
        if ((*s & 0x80) == 0)
                len = 1;
***************
*** 411,421 ****
                len = 3;
        else if ((*s & 0xf8) == 0xf0)
                len = 4;
        else if ((*s & 0xfc) == 0xf8)
                len = 5;
        else if ((*s & 0xfe) == 0xfc)
                len = 6;
!       return (len);
  }
  
  static int
--- 433,447 ----
                len = 3;
        else if ((*s & 0xf8) == 0xf0)
                len = 4;
+ #ifdef NOT_USED
        else if ((*s & 0xfc) == 0xf8)
                len = 5;
        else if ((*s & 0xfe) == 0xfc)
                len = 6;
! #endif
!       else
!               len = 1;
!       return len;
  }
  
  static int

---------------------------(end of broadcast)---------------------------
TIP 5: don't forget to increase your free space map settings

Reply via email to