Am Donnerstag, 24. August 2006 18:47 schrieb Abdelrazak Younes: > Enrico Forestieri wrote: > > I have compiled a Cygwin version of LyX/Qt4 using the native GUI (no X11) > > and now I see the Chinese characters mentioned by Abdel when I load > > an old document. I also get a lot of messages on the console: > > > > Error returned from iconv > > EILSEQ An invalid multibyte sequence has been encountered in the input. > > When converting from UCS-4 to UCS-2. > > Input: 0xff 0xff 0xff 0xa2 > > > > This was not the case with Qt3 where the characters were simply hollow > > squares on screen. However, when I start a new document the characters > > are shown correctly. > > The attached patch at least makes LyX show normal text correctly.
Abdel, do you still need that patch? There is something with the byte order I don't understand. I just found out that with the iconv on my little-endian Linux box UCS-4 == UCS-4BE. That means that the bytes coming from iconv are in big-endian byte order. This is reversed in bytes_to_ucs4: std::vector<lyx::char_type> ucs4; for (size_t i = 0; i < bytes.size(); i += 4) { unsigned char const b1 = bytes[i ]; unsigned char const b2 = bytes[i + 1]; unsigned char const b3 = bytes[i + 2]; unsigned char const b4 = bytes[i + 3]; // b1 is highest byte, b4 the lowest: // c = b1 * 8^3 + b2 * 8^2 + b3 * 8 + b4 lyx::char_type c; char * cc = reinterpret_cast<char *>(&c); // The following assumes little endian storage: cc[3] = b1; cc[2] = b2; cc[1] = b3; cc[0] = b4; if (c > 0xffff) { lyxerr << "Strange ucs4 value encountered\n"; lyxerr << "0x" << std::setw(2) << std::setfill('0') << int(b1) << std::setw(2) << std::setfill('0') << int(b2) << std::setw(2) << std::setfill('0') << int(b3) << std::setw(2) << std::setfill('0') << int(b4) << ' ' << "(0x" << c << ") "; } ucs4.push_back(c); } If you still need the patch then I don't understand what happens. Furthermore, the byte order change does not happen in bytes_to_ucs2. I believe that the attached patch should make the Unicode conversions work on both little- and big-endian machines and remove the uncertainty about whether UCS-4 is LE or BE. Using UCS-4-INTERNAL and copying the result with memmove(), as suggested by Angus, would be even better, but it seems that this encoding is not available on Windows. Comments and tests on Windows and Mac PPC are welcome. Georg
Index: src/support/unicode.C =================================================================== --- src/support/unicode.C (Revision 14882) +++ src/support/unicode.C (Arbeitskopie) @@ -122,10 +122,17 @@ std::vector<boost::uint32_t> bytes_to_uc boost::uint32_t c; char * cc = reinterpret_cast<char *>(&c); +#ifdef WORDS_BIGENDIAN + cc[0] = b1; + cc[1] = b2; + cc[2] = b3; + cc[3] = b4; +#else cc[3] = b1; cc[2] = b2; cc[1] = b3; cc[0] = b4; +#endif if (c > 0xffff) { lyxerr << "Strange ucs4 value encountered\n"; @@ -158,8 +165,13 @@ std::vector<unsigned short> bytes_to_ucs unsigned short c; char * cc = reinterpret_cast<char *>(&c); +#ifdef WORDS_BIGENDIAN cc[0] = b1; cc[1] = b2; +#else + cc[1] = b1; + cc[0] = b2; +#endif //lyxerr << "0x" // << std::setw(2) << std::setfill('0') << int(b2) @@ -185,7 +197,7 @@ std::vector<boost::uint32_t> utf8_to_ucs //lyxerr << "Res = " << string(res.begin(), res.end()) // << " (" << res.size() << ")" << endl; - std::vector<char> res = iconv_convert("UCS-4", "UTF-8", utf8str); + std::vector<char> res = iconv_convert("UCS-4BE", "UTF-8", utf8str); return bytes_to_ucs4(res); } @@ -200,13 +212,13 @@ ucs2_to_ucs4(std::vector<unsigned short> //lyxerr << std::hex; for (; cit != end; ++cit) { unsigned short s = *cit; - in.push_back(static_cast<char>(s & 0x00ff)); in.push_back(static_cast<char>((s & 0xff00) >> 8)); - lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl; + in.push_back(static_cast<char>(s & 0x00ff)); lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl; + lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl; } - std::vector<char> res = iconv_convert("UCS-4", "UCS-2", in); + std::vector<char> res = iconv_convert("UCS-4BE", "UCS-2BE", in); return bytes_to_ucs4(res); } @@ -224,7 +236,7 @@ ucs4_to_ucs2(std::vector<boost::uint32_t in.push_back(static_cast<char>((s & 0x0000ff00) >> 8)); in.push_back(static_cast<char>(s & 0x000000ff)); } - std::vector<char> res = iconv_convert("UCS-2", 
"UCS-4", in); + std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in); return bytes_to_ucs2(res); } @@ -239,7 +251,7 @@ ucs4_to_ucs2(boost::uint32_t const * s, in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8)); in.push_back(static_cast<char>(s[i] & 0x000000ff)); } - std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in); + std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in); return bytes_to_ucs2(res); } @@ -252,7 +264,7 @@ ucs4_to_ucs2(boost::uint32_t c) in.push_back(static_cast<char>((c & 0x00ff0000) >> 16)); in.push_back(static_cast<char>((c & 0x0000ff00) >> 8)); in.push_back(static_cast<char>(c & 0x000000ff)); - std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in); + std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in); std::vector<unsigned short> us = bytes_to_ucs2(res); if (!us.empty()) return us[0]; @@ -273,7 +285,7 @@ std::vector<char> ucs4_to_utf8(std::vect in.push_back(static_cast<char>((s & 0x0000ff00) >> 8)); in.push_back(static_cast<char>(s & 0x000000ff)); } - std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in); + std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in); return res; } @@ -285,6 +297,6 @@ std::vector<char> ucs4_to_utf8(boost::ui in.push_back(static_cast<char>((c & 0x00ff0000) >> 16)); in.push_back(static_cast<char>((c & 0x0000ff00) >> 8)); in.push_back(static_cast<char>(c & 0x000000ff)); - std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in); + std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in); return res; } Index: configure.ac =================================================================== --- configure.ac (Revision 14882) +++ configure.ac (Arbeitskopie) @@ -64,6 +64,9 @@ AC_PROG_CC AC_ISC_POSIX AC_AIX +### we need to know the byte order for unicode conversions +AC_C_BIGENDIAN + ### check which frontend we want to use LYX_USE_FRONTENDS Index: development/scons/SConstruct =================================================================== --- 
development/scons/SConstruct (Revision 14882) +++ development/scons/SConstruct (Arbeitskopie) @@ -1100,6 +1100,10 @@ int count() (spell_engine is not None, spell_engine, 'Spell engine to use' ), + # we need to know the byte order for unicode conversions + (sys.byteorder == 'big', 'WORDS_BIGENDIAN' + 'Define to 1 if your processor stores words with the most significant byte first (like Motorola and SPARC, unlike Intel and VAX).' + ), ], extra_items = [ ('#define PACKAGE "%s%s"' % (package, program_suffix),