Re: Chinese LyX

Georg Baum Sun, 03 Sep 2006 12:48:07 -0700

Am Donnerstag, 24. August 2006 18:47 schrieb Abdelrazak Younes:
> Enrico Forestieri wrote:
> > I have compiled a Cygwin version of LyX/Qt4 using the native GUI (no X11)
> > and now I see the chinese characters mentioned by Abdel when I load
> > an old document. I also get a lot of messages on the console:
> > 
> > Error returned from iconv
> > EILSEQ An invalid multibyte sequence has been encountered in the input.
> > When converting from UCS-4 to UCS-2.
> > Input: 0xff 0xff 0xff 0xa2 
> > 
> > This was not the case with Qt3 where the characters were simply hollow
> > squares on screen. However, when I start a new document the characters
> > are shown correctly.
> 
> The attached patch at least makes LyX show normal text correctly.


Abdel,

do you still need that patch?

There is something about the byte order that I don't understand. I just
found out that with the iconv on my little-endian Linux box, UCS-4 ==
UCS-4BE. That means the bytes coming from iconv are in big-endian byte
order. This order is reversed in bytes_to_ucs4:

std::vector<lyx::char_type> ucs4;
for (size_t i = 0; i < bytes.size(); i += 4) {
        unsigned char const b1 = bytes[i    ];
        unsigned char const b2 = bytes[i + 1];
        unsigned char const b3 = bytes[i + 2];
        unsigned char const b4 = bytes[i + 3];

        // b1 is highest byte, b4 the lowest:
        // c = b1 * 256^3 + b2 * 256^2 + b3 * 256 + b4

        lyx::char_type c;
        char * cc = reinterpret_cast<char *>(&c);

        // The following assumes little endian storage:
        cc[3] = b1;
        cc[2] = b2;
        cc[1] = b3;
        cc[0] = b4;

        if (c > 0xffff) {
                lyxerr << "Strange ucs4 value encountered\n";
                lyxerr << "0x"
                       << std::setw(2) << std::setfill('0') << int(b1)
                       << std::setw(2) << std::setfill('0') << int(b2)
                       << std::setw(2) << std::setfill('0') << int(b3)
                       << std::setw(2) << std::setfill('0') << int(b4)
                       << ' ' << "(0x" << c << ") ";
        }
        ucs4.push_back(c);
}

If you still need the patch, then I don't understand what is happening.
Furthermore, the byte order is not swapped in bytes_to_ucs2.

I believe that the attached patch should make the unicode conversions work
on both little- and big-endian machines, and remove the uncertainty about
whether UCS-4 is LE or BE.

Using UCS4-INTERNAL and memmove()ing the result, as suggested by Angus,
would be even better, but that encoding does not seem to be available on
Windows.

Comments and tests on Windows and Mac PPC are welcome.


Georg

Index: src/support/unicode.C
===================================================================
--- src/support/unicode.C	(Revision 14882)
+++ src/support/unicode.C	(Arbeitskopie)
@@ -122,10 +122,17 @@ std::vector<boost::uint32_t> bytes_to_uc
 
 		boost::uint32_t c;
 		char * cc = reinterpret_cast<char *>(&c);
+#ifdef WORDS_BIGENDIAN
+		cc[0] = b1;
+		cc[1] = b2;
+		cc[2] = b3;
+		cc[3] = b4;
+#else
 		cc[3] = b1;
 		cc[2] = b2;
 		cc[1] = b3;
 		cc[0] = b4;
+#endif
 
 		if (c > 0xffff) {
 			lyxerr << "Strange ucs4 value encountered\n";
@@ -158,8 +165,13 @@ std::vector<unsigned short> bytes_to_ucs
 
 		unsigned short c;
 		char * cc = reinterpret_cast<char *>(&c);
+#ifdef WORDS_BIGENDIAN
 		cc[0] = b1;
 		cc[1] = b2;
+#else
+		cc[1] = b1;
+		cc[0] = b2;
+#endif
 
 		//lyxerr << "0x"
 		//       << std::setw(2) << std::setfill('0') << int(b2)
@@ -185,7 +197,7 @@ std::vector<boost::uint32_t> utf8_to_ucs
 	//lyxerr << "Res = " << string(res.begin(), res.end())
 	//       << " (" << res.size() << ")" << endl;
 
-	std::vector<char> res = iconv_convert("UCS-4", "UTF-8", utf8str);
+	std::vector<char> res = iconv_convert("UCS-4BE", "UTF-8", utf8str);
 	return bytes_to_ucs4(res);
 }
 
@@ -200,13 +212,13 @@ ucs2_to_ucs4(std::vector<unsigned short>
 	//lyxerr << std::hex;
 	for (; cit != end; ++cit) {
 		unsigned short s = *cit;
-		in.push_back(static_cast<char>(s & 0x00ff));
 		in.push_back(static_cast<char>((s & 0xff00) >> 8));
-		lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
+		in.push_back(static_cast<char>(s & 0x00ff));
 		lyxerr << std::setw(2) << std::setfill('0') << ((s & 0xff00) >> 8) << endl;
+		lyxerr << std::setw(2) << std::setfill('0') << (s & 0x00ff) << endl;
 	}
 
-	std::vector<char> res = iconv_convert("UCS-4", "UCS-2", in);
+	std::vector<char> res = iconv_convert("UCS-4BE", "UCS-2BE", in);
 	return bytes_to_ucs4(res);
 }
 
@@ -224,7 +236,7 @@ ucs4_to_ucs2(std::vector<boost::uint32_t
 		in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
 		in.push_back(static_cast<char>(s & 0x000000ff));
 	}
-	std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
+	std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
 	return bytes_to_ucs2(res);
 }
 
@@ -239,7 +251,7 @@ ucs4_to_ucs2(boost::uint32_t const * s, 
 		in.push_back(static_cast<char>((s[i] & 0x0000ff00) >> 8));
 		in.push_back(static_cast<char>(s[i] & 0x000000ff));
 	}
-	std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
+	std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
 	return bytes_to_ucs2(res);
 }
 
@@ -252,7 +264,7 @@ ucs4_to_ucs2(boost::uint32_t c)
 	in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
 	in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
 	in.push_back(static_cast<char>(c & 0x000000ff));
-	std::vector<char> res = iconv_convert("UCS-2", "UCS-4", in);
+	std::vector<char> res = iconv_convert("UCS-2BE", "UCS-4BE", in);
 	std::vector<unsigned short> us = bytes_to_ucs2(res);
 	if (!us.empty())
 		return us[0];
@@ -273,7 +285,7 @@ std::vector<char> ucs4_to_utf8(std::vect
 		in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
 		in.push_back(static_cast<char>(s & 0x000000ff));
 	}
-	std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
+	std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in);
 	return res;
 }
 
@@ -285,6 +297,6 @@ std::vector<char> ucs4_to_utf8(boost::ui
 	in.push_back(static_cast<char>((c & 0x00ff0000) >> 16));
 	in.push_back(static_cast<char>((c & 0x0000ff00) >> 8));
 	in.push_back(static_cast<char>(c & 0x000000ff));
-	std::vector<char> res = iconv_convert("UTF-8", "UCS-4", in);
+	std::vector<char> res = iconv_convert("UTF-8", "UCS-4BE", in);
 	return res;
 }
Index: configure.ac
===================================================================
--- configure.ac	(Revision 14882)
+++ configure.ac	(Arbeitskopie)
@@ -64,6 +64,9 @@ AC_PROG_CC
 AC_ISC_POSIX
 AC_AIX
 
+### we need to know the byte order for unicode conversions
+AC_C_BIGENDIAN
+
 ### check which frontend we want to use
 LYX_USE_FRONTENDS
 
Index: development/scons/SConstruct
===================================================================
--- development/scons/SConstruct	(Revision 14882)
+++ development/scons/SConstruct	(Arbeitskopie)
@@ -1100,6 +1100,10 @@ int count()
             (spell_engine is not None, spell_engine,
                 'Spell engine to use'
             ),
+            # we need to know the byte order for unicode conversions
+            (sys.byteorder == 'big', 'WORDS_BIGENDIAN'
+                'Define to 1 if your processor stores words with the most significant byte first (like Motorola and SPARC, unlike Intel and VAX).'
+            ),
         ],
         extra_items = [
             ('#define PACKAGE "%s%s"' % (package, program_suffix),

Re: Chinese LyX

Reply via email to