There's a discussion over at http://www.postgresql.org/message-id/flat/2sa.dhu5.1hk1yrptnfy.1ml...@seznam.cz of an apparent error in our WIN1250 -> LATIN2 conversion. I looked into this and found that indeed, the code will happily translate certain characters for which there seems to be no justification. I made up a quick script that would recompute the conversion tables in latin2_and_win1250.c from the Unicode mapping files in src/backend/utils/mb/Unicode, and what it computes is shown in the attached diff. (Zeroes in the tables indicate codes with no translation, for which an error should be thrown.)
Having done that, I thought it would be a good idea to see if we had any other conversion tables that weren't directly based on the Unicode data. The only ones I could find were in cyrillic_and_mic.c, and those seem to be absolutely filled with errors, to the point where I wonder if they were made from the claimed encodings or some other ones. The attached patch recomputes those from the Unicode data, too.

None of this data seems to have been touched since Tatsuo-san's original commit 969e0246, so it looks like we simply didn't vet that submission closely enough.

I have not attempted to reverify the files in utils/mb/Unicode against the original Unicode Consortium data, but maybe we ought to do that before taking any further steps here.

Anyway, what are we going to do about this? I'm concerned that simply shoving in corrections may cause problems for users. Almost certainly, we should not back-patch this kind of change.

regards, tom lane

diff --git a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c index 5d1c59b..97e890d 100644 *** a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c --- b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c *************** iso2mic(const unsigned char *l, unsigned *** 433,439 **** 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, --- 433,439 ---- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x9a, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, *************** mic2iso(const unsigned char *mic, unsign *** 458,464 **** 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x00, --- 458,464 ---- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 
0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x00, *************** win12512mic(const unsigned char *l, unsi *** 485,494 **** 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, ! 0xb3, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x00, 0xb7, ! 0x00, 0x00, 0xb6, 0xa6, 0xad, 0x00, 0x00, 0x00, ! 0xa3, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x00, 0xa7, 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, --- 485,494 ---- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0xb3, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, ! 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, *************** mic2win1251(const unsigned char *mic, un *** 510,520 **** 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0xb8, 0xba, 0x00, 0xb3, 0xbf, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0xa8, 0xaa, 0x00, 0xb2, 0xaf, ! 
0x00, 0x00, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0xfe, 0xe0, 0xe1, 0xf6, 0xe4, 0xe5, 0xf4, 0xe3, 0xf5, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xff, 0xf0, 0xf1, 0xf2, 0xf3, 0xe6, 0xe2, --- 510,520 ---- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xa0, 0x00, 0xb0, 0x00, 0xb7, 0x00, + 0x00, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0xa8, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa9, 0xfe, 0xe0, 0xe1, 0xf6, 0xe4, 0xe5, 0xf4, 0xe3, 0xf5, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xff, 0xf0, 0xf1, 0xf2, 0xf3, 0xe6, 0xe2, *************** win8662mic(const unsigned char *l, unsig *** 539,554 **** 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, ! 0xb3, 0xa3, 0xb4, 0xa4, 0xb7, 0xa7, 0x00, 0x00, ! 0xb6, 0xa6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN866, win8662koi); --- 539,554 ---- 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, ! 0x90, 0x91, 0x92, 0x81, 0x87, 0xb2, 0xb4, 0xa7, ! 0xa6, 0xb5, 0xa1, 0xa8, 0xae, 0xad, 0xac, 0x83, ! 0x84, 0x89, 0x88, 0x86, 0x80, 0x8a, 0xaf, 0xb0, ! 0xab, 0xa5, 0xbb, 0xb8, 0xb1, 0xa0, 0xbe, 0xb9, ! 0xba, 0xb6, 0xb7, 0xaa, 0xa9, 0xa2, 0xa4, 0xbd, ! 
0xbc, 0x85, 0x82, 0x8d, 0x8c, 0x8e, 0x8f, 0x8b, 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, ! 0xb3, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x9c, 0x95, 0x9e, 0x96, 0x00, 0x00, 0x94, 0x9a }; latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN866, win8662koi); *************** static void *** 559,572 **** mic2win866(const unsigned char *mic, unsigned char *p, int len) { static const unsigned char koi2win866[] = { ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0xf1, 0xf3, 0x00, 0xf9, 0xf5, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0xf0, 0xf2, 0x00, 0xf8, 0xf4, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0xee, 0xa0, 0xa1, 0xe6, 0xa4, 0xa5, 0xe4, 0xa3, 0xe5, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xef, 0xe0, 0xe1, 0xe2, 0xe3, 0xa6, 0xa2, --- 559,572 ---- mic2win866(const unsigned char *mic, unsigned char *p, int len) { static const unsigned char koi2win866[] = { ! 0xc4, 0xb3, 0xda, 0xbf, 0xc0, 0xd9, 0xc3, 0xb4, ! 0xc2, 0xc1, 0xc5, 0xdf, 0xdc, 0xdb, 0xdd, 0xde, ! 0xb0, 0xb1, 0xb2, 0x00, 0xfe, 0xf9, 0xfb, 0x00, ! 0x00, 0x00, 0xff, 0x00, 0xf8, 0x00, 0xfa, 0x00, ! 0xcd, 0xba, 0xd5, 0xf1, 0xd6, 0xc9, 0xb8, 0xb7, ! 0xbb, 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, ! 0xc7, 0xcc, 0xb5, 0xf0, 0xb6, 0xb9, 0xd1, 0xd2, ! 
0xcb, 0xcf, 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0xee, 0xa0, 0xa1, 0xe6, 0xa4, 0xa5, 0xe4, 0xa3, 0xe5, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xef, 0xe0, 0xe1, 0xe2, 0xe3, 0xa6, 0xa2, diff --git a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c index 8f831ba..5e35c75 100644 *** a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c --- b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c *************** static void *** 154,163 **** win12502mic(const unsigned char *l, unsigned char *p, int len) { static const unsigned char win1250_2_iso88592[] = { ! 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, ! 0x88, 0x89, 0xA9, 0x8B, 0xA6, 0xAB, 0xAE, 0xAC, ! 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, ! 0x98, 0x99, 0xB9, 0x9B, 0xB6, 0xBB, 0xBE, 0xBC, 0xA0, 0xB7, 0xA2, 0xA3, 0xA4, 0xA1, 0x00, 0xA7, 0xA8, 0x00, 0xAA, 0x00, 0x00, 0xAD, 0x00, 0xAF, 0xB0, 0x00, 0xB2, 0xB3, 0xB4, 0x00, 0x00, 0x00, --- 154,163 ---- win12502mic(const unsigned char *l, unsigned char *p, int len) { static const unsigned char win1250_2_iso88592[] = { ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0xA9, 0x00, 0xA6, 0xAB, 0xAE, 0xAC, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0xB9, 0x00, 0xB6, 0xBB, 0xBE, 0xBC, 0xA0, 0xB7, 0xA2, 0xA3, 0xA4, 0xA1, 0x00, 0xA7, 0xA8, 0x00, 0xAA, 0x00, 0x00, 0xAD, 0x00, 0xAF, 0xB0, 0x00, 0xB2, 0xB3, 0xB4, 0x00, 0x00, 0x00, *************** static void *** 180,189 **** mic2win1250(const unsigned char *mic, unsigned char *p, int len) { static const unsigned char iso88592_2_win1250[] = { ! 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, ! 0x88, 0x89, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x00, ! 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, ! 
0x98, 0x99, 0x00, 0x9B, 0x00, 0x00, 0x00, 0x00, 0xA0, 0xA5, 0xA2, 0xA3, 0xA4, 0xBC, 0x8C, 0xA7, 0xA8, 0x8A, 0xAA, 0x8D, 0x8F, 0xAD, 0x8E, 0xAF, 0xB0, 0xB9, 0xB2, 0xB3, 0xB4, 0xBE, 0x9C, 0xA1, --- 180,189 ---- mic2win1250(const unsigned char *mic, unsigned char *p, int len) { static const unsigned char iso88592_2_win1250[] = { ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ! 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA0, 0xA5, 0xA2, 0xA3, 0xA4, 0xBC, 0x8C, 0xA7, 0xA8, 0x8A, 0xAA, 0x8D, 0x8F, 0xAD, 0x8E, 0xAF, 0xB0, 0xB9, 0xB2, 0xB3, 0xB4, 0xBE, 0x9C, 0xA1,
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers