I sent this to -patches, but it has not shown up, so I am resending it to -hackers.
Comments on the matter are welcome, so that we can get this issue resolved.
Kind Regards,
John Hansen
--------------------------------------------------------------------------
Hello,
Seeing that the limit is still in place, I have attached a patch against CVS.
Kind Regards,
John Hansen
Index: src/backend/utils/mb/wchar.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
retrieving revision 1.38
diff -c -r1.38 wchar.c
*** src/backend/utils/mb/wchar.c 17 Sep 2004 21:59:57 -0000 1.38
--- src/backend/utils/mb/wchar.c 16 Nov 2004 04:06:01 -0000
***************
*** 343,348 ****
--- 343,373 ----
return (pg_euc_dsplen(s));
}
+ /*
+  * isLegalUTF8 -- validate the single UTF-8 sequence that starts at "source".
+  *
+  * source: first byte of the sequence; its length is taken from pg_utf_mblen().
+  * len:    number of bytes available at "source".
+  *
+  * Returns false when the sequence needs more than "len" bytes, when a
+  * trailing byte is outside 0x80..0xBF, when the encoding is overlong, when it
+  * encodes a UTF-16 surrogate (U+D800..U+DFFF), when a sequence led by 0xF4
+  * exceeds U+10FFFF, or when the lead byte is a stray continuation byte,
+  * 0xC0/0xC1, or above 0xFD.  Returns true otherwise.
+  */
+ bool isLegalUTF8(const UTF8 *source, int len) {
+     int seqlen = pg_utf_mblen(source);
+     const UTF8 *srcptr;
+     UTF8 a;
+
+     if (seqlen > len) return false;
+     /* Start one past the last byte and walk backwards towards the lead byte. */
+     srcptr = source + seqlen;
+     switch (seqlen) {
+         default: return false;
+         /* Everything else falls through when "true"... */
+         case 6: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+         case 5: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+         case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+         case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+         /*
+          * "a" ends up holding the second byte.  It must be a continuation
+          * byte for every lead byte, so the full range is checked here and
+          * the inner switch only narrows it further.
+          */
+         case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+             switch (*source) {
+                 /* no fall-through in this inner switch */
+                 case 0xE0: if (a < 0xA0) return false; break; /* overlong 3-byte form */
+                 case 0xED: if (a > 0x9F) return false; break; /* U+D800..U+DFFF surrogates */
+                 case 0xF0: if (a < 0x90) return false; break; /* overlong 4-byte form */
+                 case 0xF4: if (a > 0x8F) return false; break; /* above U+10FFFF */
+                 case 0xF8: if (a < 0x88) return false; break; /* overlong 5-byte form */
+                 case 0xFC: if (a < 0x84) return false; break; /* overlong 6-byte form */
+                 default: break;
+             }
+         /* falls through: the lead byte itself is checked for every length */
+         case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+             if (*source > 0xFD) return false;
+     }
+     return true;
+ }
+
+
/*
* convert UTF-8 string to pg_wchar (UCS-2)
* caller should allocate enough space for "to"
***************
*** 350,404 ****
* "from" not necessarily null terminated.
*/
static int
! pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
{
! unsigned char c1,
! c2,
! c3;
! int cnt = 0;
!
! while (len > 0 && *from)
! {
! if ((*from & 0x80) == 0)
! {
! *to = *from++;
! len--;
! }
! else if ((*from & 0xe0) == 0xc0 && len >= 2)
! {
! c1 = *from++ & 0x1f;
! c2 = *from++ & 0x3f;
! *to = c1 << 6;
! *to |= c2;
! len -= 2;
! }
! else if ((*from & 0xe0) == 0xe0 && len >= 3)
! {
! c1 = *from++ & 0x0f;
! c2 = *from++ & 0x3f;
! c3 = *from++ & 0x3f;
! *to = c1 << 12;
! *to |= c2 << 6;
! *to |= c3;
! len -= 3;
! }
! else
! {
! *to = *from++;
! len--;
! }
! to++;
! cnt++;
! }
! *to = 0;
! return (cnt);
}
/*
* returns the byte length of a UTF-8 word pointed to by s
*/
int
! pg_utf_mblen(const unsigned char *s)
{
int len = 1;
--- 375,437 ----
* "from" not necessarily null terminated.
*/
static int
! pg_utf2wchar_with_len(const UTF8 *from, pg_wchar *to, int len)
{
!     /*
!      * Decode at most "len" bytes of UTF-8 from "from" into "to", stopping
!      * early at a NUL byte.  Code points above U+FFFF are stored as a UTF-16
!      * surrogate pair (two elements).  The output is always 0-terminated, so
!      * "to" needs room for the decoded elements plus one.
!      *
!      * Returns the number of elements stored, not counting the terminator.
!      * If the input is truncated, fails isLegalUTF8(), or decodes to a
!      * surrogate or to a value above U+10FFFF, the output is reset to an
!      * empty string and 0 is returned.
!      */
!     /* Per-length constant subtracted after the decode switch to strip the lead/trail marker bits. */
!     static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
!     const UTF8 *fromEnd = from + len;
!     pg_wchar *toStart = to;
!     int cnt = 0;
!
!     while (from < fromEnd && *from) {
!         UTF32 ch = 0;
!         int extraBytesToRead = pg_utf_mblen(from) - 1;
!
!         /* The whole sequence must lie inside the input and be well formed. */
!         if (extraBytesToRead >= fromEnd - from ||
!             !isLegalUTF8(from, extraBytesToRead + 1)) {
!             to = toStart;
!             cnt = 0;
!             break;
!         }
!         /*
!          * The cases all fall through: each one adds a byte and shifts, so a
!          * sequence of N bytes enters at case N-1 and runs down to case 0.
!          */
!         switch (extraBytesToRead) {
!             case 5: ch += *from++; ch <<= 6;
!             case 4: ch += *from++; ch <<= 6;
!             case 3: ch += *from++; ch <<= 6;
!             case 2: ch += *from++; ch <<= 6;
!             case 1: ch += *from++; ch <<= 6;
!             case 0: ch += *from++;
!         }
!         ch -= offsetsFromUTF8[extraBytesToRead];
!
!         /* Surrogate code points and values above U+10FFFF cannot be stored. */
!         if (ch > UNI_MAX_UTF16 ||
!             (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
!             to = toStart;
!             cnt = 0;
!             break;
!         }
!
!         if (ch <= UNI_MAX_BMP) {
!             *to++ = ch; /* normal case */
!             cnt++;
!         } else {
!             /* character is in range 0x10000 - 0x10FFFF: emit a surrogate pair */
!             ch -= 0x0010000UL;
!             *to++ = (ch >> 10) + UNI_SUR_HIGH_START;
!             *to++ = (ch & 0x3FFUL) + UNI_SUR_LOW_START;
!             cnt += 2; /* two elements were stored */
!         }
!     }
!     *to = 0;
!     return cnt;
}
/*
* returns the byte length of a UTF-8 word pointed to by s
*/
int
! pg_utf_mblen(const UTF8 *s)
{
int len = 1;
***************
*** 406,418 ****
len = 1;
else if ((*s & 0xe0) == 0xc0)
len = 2;
! else if ((*s & 0xe0) == 0xe0)
! len = 3;
return (len);
}
static int
! pg_utf_dsplen(const unsigned char *s)
{
return 1; /* XXX fix me! */
}
--- 439,457 ----
len = 1;
else if ((*s & 0xe0) == 0xc0)
len = 2;
! else if ((*s & 0xf0) == 0xe0)
! len = 3;
! else if ((*s & 0xf8) == 0xf0)
! len = 4;
! else if ((*s & 0xfc) == 0xf8)
! len = 5;
! else if ((*s & 0xfe) == 0xfc)
! len = 6;
return (len);
}
static int
! pg_utf_dsplen(const UTF8 *s)
{
return 1; /* XXX fix me! Stub: always returns 1 and never reads "s". */
}
***************
*** 721,728 ****
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */
! {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */
--- 760,767 ----
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6}, /* 6; PG_UNICODE */
! {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */
***************
*** 744,754 ****
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */
! {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */
! {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */
! {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */
! {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */
! {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
};
/* returns the byte length of a word for mule internal code */
--- 783,793 ----
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */
! {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */
! {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */
! {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */
! {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */
! {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
};
/* returns the byte length of a word for mule internal code */
***************
*** 823,837 ****
while (len > 0 && *mbstr)
{
/* special UTF-8 check */
! if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
! {
! if (noError)
! return false;
! ereport(ERROR,
! (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
}
!
l = pg_mblen(mbstr);
for (i = 1; i < l; i++)
--- 862,876 ----
while (len > 0 && *mbstr)
{
/* special UTF-8 check */
! if (encoding == PG_UTF8) {
! if(!isLegalUTF8(mbstr,len)) {
! if (noError) return false;
! ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr)));
! } else {
! return true;
! }
}
!
l = pg_mblen(mbstr);
for (i = 1; i < l; i++)
Index: src/include/mb/pg_wchar.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.52
diff -c -r1.52 pg_wchar.h
*** src/include/mb/pg_wchar.h 17 Sep 2004 21:59:57 -0000 1.52
--- src/include/mb/pg_wchar.h 16 Nov 2004 04:06:02 -0000
***************
*** 16,21 ****
--- 16,35 ----
* The pg_wchar
*/
typedef unsigned int pg_wchar;
+ typedef unsigned int UTF32; /* at least 32 bits */
+ typedef unsigned int UTF16; /* at least 16 bits */
+ typedef unsigned char UTF8; /* typically 8 bits */
+
+ /*
+  * Some fundamental constants.  Each expansion is fully parenthesized so the
+  * cast stays attached to its literal wherever the macro is used.
+  */
+ #define UNI_REPLACEMENT_CHAR ((UTF32) 0x0000FFFD)
+ #define UNI_MAX_BMP ((UTF32) 0x0000FFFF)
+ #define UNI_MAX_UTF16 ((UTF32) 0x0010FFFF)
+ #define UNI_MAX_UTF32 ((UTF32) 0x7FFFFFFF)
+
+ /* Bounds of the UTF-16 high and low surrogate ranges */
+ #define UNI_SUR_HIGH_START ((UTF32) 0xD800)
+ #define UNI_SUR_HIGH_END ((UTF32) 0xDBFF)
+ #define UNI_SUR_LOW_START ((UTF32) 0xDC00)
+ #define UNI_SUR_LOW_END ((UTF32) 0xDFFF)
/*
* various definitions for EUC
***************
*** 339,342 ****
--- 353,358 ----
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
+ extern bool isLegalUTF8(const UTF8 *source, int len);
+
#endif /* PG_WCHAR_H */
---------------------------(end of broadcast)---------------------------
TIP 8: explain analyze is your friend