Patch applied. Thanks. ---------------------------------------------------------------------------
John Hansen wrote: > Bruce, > > Attached patch replaces the original, applied today against CVS HEAD. > Fixes the surrogates, and limits to 4 byte utf8 as per spec. > > Also extends UtfToLocal to 4 byte characters (tho, it does not add any, > just enables the code to handle them. If my interpretation of this code > is wrong, please let me know, and correct it). > > ... John > > > -----Original Message----- > > From: Bruce Momjian [mailto:[EMAIL PROTECTED] > > Sent: Sunday, June 05, 2005 11:23 AM > > To: pgman@candle.pha.pa.us > > Cc: John Hansen; pgsql-hackers@postgresql.org; PostgreSQL-patches > > Subject: Re: [PATCHES] Unicode characters above 0x10000 #2 > > > > > > Your patch has been added to the PostgreSQL unapplied patches list at: > > > > http://momjian.postgresql.org/cgi-bin/pgpatches > > > > It will be applied as soon as one of the PostgreSQL > > committers reviews and approves it. > > > > -------------------------------------------------------------- > > ------------- > > > > > > pgman wrote: > > > > > > I have backed out this patch. It is unclear it is a bug fix. > > > > > > It will be saved for 8.1. > > > > > > > > ---------------------------------------------------------------------- > > > ----- > > > > > > pgman wrote: > > > > > > > > Patch applied. Thanks. > > > > > > > > > > -------------------------------------------------------------------- > > > > ------- > > > > > > > > > > > > John Hansen wrote: > > > > > 3 times lucky? > > > > > > > > > > Last one broke utf8.... Grrrr > > > > > > > > > > This one works,.... Too tired, sorry for the inconvenience.. > > > > > > > > > > ... John > > > > > > > > Content-Description: cvs.diff > > > > > > > > [ Attachment, skipping... ] > > > > > > > > > > > > > > ---------------------------(end of > > > > > broadcast)--------------------------- > > > > > TIP 9: the planner will ignore your desire to choose an > > index scan if your > > > > > joining column's datatypes do not match > > > > > > > > -- > > > > Bruce Momjian | http://candle.pha.pa.us > > > > pgman@candle.pha.pa.us | (610) 359-1001 > > > > + If your life is a hard drive, | 13 Roberts Road > > > > + Christ can be your backup. | Newtown Square, > > Pennsylvania 19073 > > > > > > -- > > > Bruce Momjian | http://candle.pha.pa.us > > > pgman@candle.pha.pa.us | (610) 359-1001 > > > + If your life is a hard drive, | 13 Roberts Road > > > + Christ can be your backup. | Newtown Square, > > Pennsylvania 19073 > > > > > =================================================================== > > > RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v > > > retrieving revision 1.38 > > > diff -c -r1.38 wchar.c > > > *** src/backend/utils/mb/wchar.c 17 Sep 2004 21:59:57 > > -0000 1.38 > > > --- src/backend/utils/mb/wchar.c 21 Nov 2004 09:58:36 -0000 > > > *************** > > > *** 343,348 **** > > > --- 343,373 ---- > > > return (pg_euc_dsplen(s)); > > > } > > > > > > + bool isLegalUTF8(const UTF8 *source, int len) { > > > + UTF8 a; > > > + const UTF8 *srcptr = source+len; > > > + if(!source || (pg_utf_mblen(source) != len)) return false; > > > + switch (len) { > > > + default: return false; > > > + /* Everything else falls through when "true"... */ > > > + case 6: if ((a = (*--srcptr)) < 0x80 || a > > > 0xBF) return false; > > > + case 5: if ((a = (*--srcptr)) < 0x80 || a > > > 0xBF) return false; > > > + case 4: if ((a = (*--srcptr)) < 0x80 || a > > > 0xBF) return false; > > > + case 3: if ((a = (*--srcptr)) < 0x80 || a > > > 0xBF) return false; > > > + case 2: if ((a = (*--srcptr)) > 0xBF) return false; > > > + switch (*source) { > > > + /* no fall-through in this inner switch */ > > > + case 0xE0: if (a < 0xA0) return false; break; > > > + case 0xF0: if (a < 0x90) return false; break; > > > + case 0xF4: if (a > 0x8F) return false; break; > > > + default: if (a < 0x80) return false; > > > + } > > > + case 1: if (*source >= 0x80 && *source < > > 0xC2) return false; > > > + if (*source > 0xFD) return false; > > > + } > > > + return true; > > > + } > > > + > > > /* > > > * convert UTF-8 string to pg_wchar (UCS-2) > > > * caller should allocate enough space for "to" > > > *************** > > > *** 398,404 **** > > > * returns the byte length of a UTF-8 word pointed to by s > > > */ > > > int > > > ! pg_utf_mblen(const unsigned char *s) > > > { > > > int len = 1; > > > > > > --- 423,429 ---- > > > * returns the byte length of a UTF-8 word pointed to by s > > > */ > > > int > > > ! pg_utf_mblen(const UTF8 *s) > > > { > > > int len = 1; > > > > > > *************** > > > *** 406,418 **** > > > len = 1; > > > else if ((*s & 0xe0) == 0xc0) > > > len = 2; > > > ! else if ((*s & 0xe0) == 0xe0) > > > ! len = 3; > > > return (len); > > > } > > > > > > static int > > > ! pg_utf_dsplen(const unsigned char *s) > > > { > > > return 1; /* XXX > > fix me! */ > > > } > > > --- 431,449 ---- > > > len = 1; > > > else if ((*s & 0xe0) == 0xc0) > > > len = 2; > > > ! else if ((*s & 0xf0) == 0xe0) > > > ! len = 3; > > > ! else if ((*s & 0xf8) == 0xf0) > > > ! len = 4; > > > ! else if ((*s & 0xfc) == 0xf8) > > > ! len = 5; > > > ! else if ((*s & 0xfe) == 0xfc) > > > ! len = 6; > > > return (len); > > > } > > > > > > static int > > > ! pg_utf_dsplen(const UTF8 *s) > > > { > > > return 1; /* XXX > > fix me! */ > > > } > > > *************** > > > *** 721,728 **** > > > {pg_euckr2wchar_with_len, pg_euckr_mblen, > > pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ > > > {pg_euctw2wchar_with_len, pg_euctw_mblen, > > pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ > > > {pg_johab2wchar_with_len, pg_johab_mblen, > > pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ > > > ! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, > > 3}, /* 6; PG_UNICODE */ > > > ! {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, > > 3}, /* 7; PG_MULE_INTERNAL */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ > > > --- 752,759 ---- > > > {pg_euckr2wchar_with_len, pg_euckr_mblen, > > pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ > > > {pg_euctw2wchar_with_len, pg_euctw_mblen, > > pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ > > > {pg_johab2wchar_with_len, pg_johab_mblen, > > pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ > > > ! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, > > 6}, /* 6; PG_UNICODE */ > > > ! {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, > > 3}, /* 7; PG_MULE_INTERNAL */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ > > > *************** > > > *** 744,754 **** > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ > > > ! {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; > > PG_SJIS */ > > > ! {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; > > PG_BIG5 */ > > > ! {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ > > > ! {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ > > > ! {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ > > > }; > > > > > > /* returns the byte length of a word for mule internal code */ > > > --- 775,785 ---- > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ > > > {pg_latin12wchar_with_len, pg_latin1_mblen, > > pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ > > > ! {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, > > /* 29; PG_SJIS */ > > > ! {0, pg_big5_mblen, pg_big5_dsplen, 2}, > > /* 30; PG_BIG5 */ > > > ! {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, > > /* 31; PG_GBK */ > > > ! {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, > > /* 32; PG_UHC */ > > > ! {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} > > /* 33; PG_GB18030 */ > > > }; > > > > > > /* returns the byte length of a word for mule internal code */ > > > *************** > > > *** 822,872 **** > > > > > > while (len > 0 && *mbstr) > > > { > > > - /* special UTF-8 check */ > > > - if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0) > > > - { > > > - if (noError) > > > - return false; > > > - ereport(ERROR, > > > - > > (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), > > > - errmsg("Unicode > > characters greater than or equal to 0x10000 are not supported"))); > > > - } > > > - > > > l = pg_mblen(mbstr); > > > > > > ! for (i = 1; i < l; i++) > > > ! { > > > ! /* > > > ! * we expect that every multibyte char > > consists of bytes > > > ! * having the 8th bit set > > > ! */ > > > ! if (i >= len || (mbstr[i] & 0x80) == 0) > > > { > > > ! char buf[8 * 2 + 1]; > > > ! char *p = buf; > > > ! int j, > > > jlimit; > > > > > > ! if (noError) > > > ! return false; > > > > > > ! jlimit = Min(l, len); > > > ! jlimit = Min(jlimit, 8); > > /* prevent buffer overrun */ > > > > > > ! for (j = 0; j < jlimit; j++) > > > ! p += sprintf(p, "%02x", > > mbstr[j]); > > > > > > ! ereport(ERROR, > > > ! > > (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), > > > ! errmsg("invalid byte sequence > > for encoding \"%s\": 0x%s", > > > ! > > GetDatabaseEncodingName(), buf))); > > > } > > > - } > > > > > > len -= l; > > > mbstr += l; > > > } > > > - > > > return true; > > > } > > > > > > --- 853,900 ---- > > > > > > while (len > 0 && *mbstr) > > > { > > > l = pg_mblen(mbstr); > > > > > > ! /* special UTF-8 check */ > > > ! if (encoding == PG_UTF8) { > > > ! if(!isLegalUTF8(mbstr,l)) { > > > ! if (noError) return false; > > > ! > > ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),er > > rmsg("Invalid UNICODE byte sequence detected near character > > %c",*mbstr))); > > > ! } > > > ! } else { > > > ! for (i = 1; i < l; i++) > > > { > > > ! /* > > > ! * we expect that every > > multibyte char consists of bytes > > > ! * having the 8th bit set > > > ! */ > > > ! if (i >= len || (mbstr[i] & 0x80) == 0) > > > ! { > > > ! char buf[8 * 2 + 1]; > > > ! char *p = buf; > > > ! int j, > > > jlimit; > > > > > > ! if (noError) > > > ! return false; > > > > > > ! jlimit = Min(l, len); > > > ! jlimit = Min(jlimit, > > 8); /* prevent buffer overrun */ > > > > > > ! for (j = 0; j < jlimit; j++) > > > ! p += sprintf(p, > > "%02x", mbstr[j]); > > > > > > ! ereport(ERROR, > > > ! > > (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), > > > ! errmsg("invalid byte > > sequence for encoding \"%s\": 0x%s", > > > ! > > GetDatabaseEncodingName(), buf))); > > > ! } > > > } > > > > > > + } > > > len -= l; > > > mbstr += l; > > > } > > > return true; > > > } > > > > > > Index: src/include/mb/pg_wchar.h > > > =================================================================== > > > RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v > > > retrieving revision 1.52 > > > diff -c -r1.52 pg_wchar.h > > > *** src/include/mb/pg_wchar.h 17 Sep 2004 21:59:57 > > -0000 1.52 > > > --- src/include/mb/pg_wchar.h 21 Nov 2004 09:58:36 -0000 > > > *************** > > > *** 17,22 **** > > > --- 17,30 ---- > > > */ > > > typedef unsigned int pg_wchar; > > > > > > + > > > + /* > > > + * The UTF types > > > + */ > > > + typedef unsigned int UTF32; /* at least 32 bits */ > > > + typedef unsigned short UTF16; /* at least 16 bits */ > > > + typedef unsigned char UTF8; /* typically 8 bits */ > > > + > > > /* > > > * various definitions for EUC > > > */ > > > *************** > > > *** 339,342 **** > > > --- 347,352 ---- > > > extern void latin2mic_with_table(unsigned char *l, > > unsigned char *p, int len, int lc, unsigned char *tab); > > > extern void mic2latin_with_table(unsigned char *mic, > > unsigned char > > > *p, int len, int lc, unsigned char *tab); > > > > > > + extern bool isLegalUTF8(const UTF8 *source, int len); > > > + > > > #endif /* PG_WCHAR_H */ > > > > > > > > > > -- > > Bruce Momjian | http://candle.pha.pa.us > > pgman@candle.pha.pa.us | (610) 359-1001 > > + If your life is a hard drive, | 13 Roberts Road > > + Christ can be your backup. | Newtown Square, > > Pennsylvania 19073 > > > > Content-Description: unicode.diff [ Attachment, skipping... ] -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 359-1001 + If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania 19073 ---------------------------(end of broadcast)--------------------------- TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]