Up to now there have been two proposals how to fix the plain text output:

1) from Abdel: Still use narrow streams

virtual int InsetBase::plaintext(Buffer const &, std::ostream & os, 
OutputParams const &) const;

and implement operators for docstring and char_type output that convert to 
utf8:

std::ostream & operator<<(std::ostream & os, lyx::char_type const &);
std::ostream & operator<<(std::ostream & os, lyx::docstring const &);


2) from me: Change the stream type to use lyx::char_type as character type 
and do the conversion to utf8 in a special file stream:

virtual int InsetBase::plaintext(Buffer const &, 
std::basic_ostream<lyx::char_type> & os, OutputParams const &) const;

1) is easy to implement, but it would either require ucs4 -> utf8 -> ucs4 
conversions or some code duplication/refactoring since plain text output 
is also used internally. 2) has some problems: gcc does not have useful 
std::locale::facet specializations for anything else than char and wchar_t 
character types. AFAICS the ctype (for all streams) and codecvt (for file 
streams) facets are the most important ones. BTW I wrote earlier that 
char_traits<lyx::char_type> were a problem, but that is only true for 
older gcc versions, and I already put the relevant parts from gcc 4.2 in 
docstring.h, so this problem is solved.
I tried to pull the wchar_t specializations out of the relevant portions of 
libstdc++, but failed to create a working version for lyx::char_type. The 
specializations are not only scattered over many files, they also appear in 
an internal init function that we of course cannot modify, so I did not 
succeed.

Unfortunately we need to solve these problems even if we are going to use 
solution 1), since otherwise we are not able to use stringstreams for 
docstring, and I don't think that we can live without them.

At least on Linux (and other OSes where sizeof(wchar_t) == 4) there is a 
very easy solution:

typedef wchar_t lyx::char_type;

Then we can easily use wide string streams, and the conversion to utf8 
can also be done transparently with something like the 
utf8_codecvt_facet in the attached file (which, by the way, uses iconv 
without copying any data). Fortunately Peter confirmed that the existing 
lyx::char_type works at least for stringstreams on Windows.

Peter and Abdel, can you please test whether the attached test program 
works (or could be made to work) on windows with lyx::char_type == 
boost::uint32_t?

If yes, then I'd like to put the attached patch in and proceed with 
solution 2) above.


Georg
Index: src/lyxlex_pimpl.C
===================================================================
--- src/lyxlex_pimpl.C	(Revision 14882)
+++ src/lyxlex_pimpl.C	(Arbeitskopie)
@@ -73,7 +73,7 @@ string const LyXLex::Pimpl::getString() 
 
 lyx::docstring const LyXLex::Pimpl::getDocString() const
 {
-        std::vector<boost::uint32_t> res = utf8_to_ucs4(buff);
+        std::vector<lyx::char_type> res = utf8_to_ucs4(buff);
         lyx::docstring dstr(res.begin(), res.end());
         return dstr;
 }
Index: src/frontends/qt3/QLPainter.C
===================================================================
--- src/frontends/qt3/QLPainter.C	(Revision 14882)
+++ src/frontends/qt3/QLPainter.C	(Arbeitskopie)
@@ -222,7 +222,7 @@ void QLPainter::text(int x, int y, lyx::
 		// Brain-dead MSVC wants at(i) rather than operator[]
 		str.at(i) = QChar(encoding->ucs(s[i]));
 #else
-	//std::vector<boost::uint32_t> in(s, s + ls);
+	//std::vector<lyx::char_type> in(s, s + ls);
 	//std::vector<unsigned short> ucs2 = ucs4_to_ucs2(in);
 	std::vector<unsigned short> ucs2 = ucs4_to_ucs2(s, ls);
 	ucs2.push_back(0);
Index: src/frontends/qt4/qt_helpers.C
===================================================================
--- src/frontends/qt4/qt_helpers.C	(Revision 14882)
+++ src/frontends/qt4/qt_helpers.C	(Arbeitskopie)
@@ -159,13 +159,13 @@ void qstring_to_ucs4(QString const & qst
 	int ls = qstr.size();
 	ucs4.clear();
 	for (int i = 0; i < ls; ++i)
-		ucs4.push_back(static_cast<boost::uint32_t>(qstr[i].unicode()));
+		ucs4.push_back(static_cast<lyx::char_type>(qstr[i].unicode()));
 }
 
 
 char_type const qchar_to_ucs4(QChar const & qchar)
 {
-	return static_cast<boost::uint32_t>(qchar.unicode());
+	return static_cast<lyx::char_type>(qchar.unicode());
 }
 
 
Index: src/support/unicode.C
===================================================================
--- src/support/unicode.C	(Revision 14882)
+++ src/support/unicode.C	(Arbeitskopie)
@@ -109,18 +109,18 @@ iconv_convert(std::string const & tocode
 }
 
 
-std::vector<boost::uint32_t> bytes_to_ucs4(std::vector<char> const & bytes)
+std::vector<lyx::char_type> bytes_to_ucs4(std::vector<char> const & bytes)
 {
 	//lyxerr << "Outbuf =" << std::hex;
 
-	std::vector<boost::uint32_t> ucs4;
+	std::vector<lyx::char_type> ucs4;
 	for (size_t i = 0; i < bytes.size(); i += 4) {
 		unsigned char const b1 = bytes[i    ];
 		unsigned char const b2 = bytes[i + 1];
 		unsigned char const b3 = bytes[i + 2];
 		unsigned char const b4 = bytes[i + 3];
 
-		boost::uint32_t c;
+		lyx::char_type c;
 		char * cc = reinterpret_cast<char *>(&c);
 		cc[3] = b1;
 		cc[2] = b2;
@@ -178,7 +178,7 @@ std::vector<unsigned short> bytes_to_ucs
 } // anon namespace
 
 
-std::vector<boost::uint32_t> utf8_to_ucs4(std::vector<char> const & utf8str)
+std::vector<lyx::char_type> utf8_to_ucs4(std::vector<char> const & utf8str)
 {
 	//lyxerr << "Buff = " << string(utf8str.begin(), utf8str.end())
 	//       << " (" << utf8str.size() << ")" << endl;
@@ -190,7 +190,7 @@ std::vector<boost::uint32_t> utf8_to_ucs
 }
 
 
-std::vector<boost::uint32_t>
+std::vector<lyx::char_type>
 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str)
 {
 	// TODO: Simplify and speed up.
@@ -212,13 +212,13 @@ ucs2_to_ucs4(std::vector<unsigned short>
 
 
 std::vector<unsigned short>
-ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str)
+ucs4_to_ucs2(std::vector<lyx::char_type> const & ucs4str)
 {
 	std::vector<char> in;
-	std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
-	std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
+	std::vector<lyx::char_type>::const_iterator cit = ucs4str.begin();
+	std::vector<lyx::char_type>::const_iterator end = ucs4str.end();
 	for (; cit != end; ++cit) {
-		boost::uint32_t s = *cit;
+		lyx::char_type s = *cit;
 		in.push_back(static_cast<char>((s & 0xff000000) >> 24));
 		in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
 		in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
@@ -230,7 +230,7 @@ ucs4_to_ucs2(std::vector<boost::uint32_t
 
 
 std::vector<unsigned short>
-ucs4_to_ucs2(boost::uint32_t const * s, size_t ls)
+ucs4_to_ucs2(lyx::char_type const * s, size_t ls)
 {
 	std::vector<char> in;
 	for (size_t i = 0; i < ls; ++i) {
@@ -245,7 +245,7 @@ ucs4_to_ucs2(boost::uint32_t const * s, 
 
 
 unsigned short
-ucs4_to_ucs2(boost::uint32_t c)
+ucs4_to_ucs2(lyx::char_type c)
 {
 	std::vector<char> in;
 	in.push_back(static_cast<char>((c & 0xff000000) >> 24));
@@ -261,13 +261,13 @@ ucs4_to_ucs2(boost::uint32_t c)
 }
 
 
-std::vector<char> ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str)
+std::vector<char> ucs4_to_utf8(std::vector<lyx::char_type> const & ucs4str)
 {
 	std::vector<char> in;
-	std::vector<boost::uint32_t>::const_iterator cit = ucs4str.begin();
-	std::vector<boost::uint32_t>::const_iterator end = ucs4str.end();
+	std::vector<lyx::char_type>::const_iterator cit = ucs4str.begin();
+	std::vector<lyx::char_type>::const_iterator end = ucs4str.end();
 	for (; cit != end; ++cit) {
-		boost::uint32_t s = *cit;
+		lyx::char_type s = *cit;
 		in.push_back(static_cast<char>((s & 0xff000000) >> 24));
 		in.push_back(static_cast<char>((s & 0x00ff0000) >> 16));
 		in.push_back(static_cast<char>((s & 0x0000ff00) >> 8));
@@ -278,7 +278,7 @@ std::vector<char> ucs4_to_utf8(std::vect
 }
 
 
-std::vector<char> ucs4_to_utf8(boost::uint32_t c)
+std::vector<char> ucs4_to_utf8(lyx::char_type c)
 {
 	std::vector<char> in;
 	in.push_back(static_cast<char>((c & 0xff000000) >> 24));
Index: src/support/docstring.C
===================================================================
--- src/support/docstring.C	(Revision 14882)
+++ src/support/docstring.C	(Arbeitskopie)
@@ -40,7 +40,7 @@ docstring const from_ascii(std::string c
 
 docstring const from_utf8(std::string const & utf8)
 {
-	std::vector<boost::uint32_t> const ucs4 =
+	std::vector<char_type> const ucs4 =
 		utf8_to_ucs4(std::vector<char>(utf8.begin(), utf8.end()));
 	return docstring(ucs4.begin(), ucs4.end());
 }
@@ -49,7 +49,7 @@ docstring const from_utf8(std::string co
 std::string const to_utf8(docstring const & ucs4)
 {
 	std::vector<char> const utf8 =
-		ucs4_to_utf8(std::vector<boost::uint32_t>(ucs4.begin(), ucs4.end()));
+		ucs4_to_utf8(std::vector<char_type>(ucs4.begin(), ucs4.end()));
 	return std::string(utf8.begin(), utf8.end());
 }
 
Index: src/support/types.h
===================================================================
--- src/support/types.h	(Revision 14882)
+++ src/support/types.h	(Arbeitskopie)
@@ -18,20 +18,10 @@
 
 #include "docstring.h"
 
-#include <boost/cstdint.hpp>
-
 #include <cstddef>
-#include <string>
 
 namespace lyx {
 
-	// The type used to hold characters in paragraphs
-	typedef boost::uint32_t char_type; // Possibly the ucs-4 type we will use
-	//typedef wchar_t char_type;  // The wide char type CJK-LyX uses
-	//typedef char char_type;       // Current narrow char type in use
-
-	//typedef std::wstring docstring;
-
 	/// a type for positions used in paragraphs
 	// needs to be signed for a while to hold the special value -1 that is
 	// used there
Index: src/support/unicode.h
===================================================================
--- src/support/unicode.h	(Revision 14882)
+++ src/support/unicode.h	(Arbeitskopie)
@@ -13,28 +13,29 @@
 #ifndef LYX_SUPPORT_UNICODE_H
 #define LYX_SUPPORT_UNICODE_H
 
-#include <boost/cstdint.hpp>
+#include "support/types.h"
+
 #include <vector>
 
-std::vector<boost::uint32_t>
+std::vector<lyx::char_type>
 utf8_to_ucs4(std::vector<char> const & utf8str);
 
-std::vector<boost::uint32_t>
+std::vector<lyx::char_type>
 ucs2_to_ucs4(std::vector<unsigned short> const & ucs2str);
 
 std::vector<unsigned short>
-ucs4_to_ucs2(std::vector<boost::uint32_t> const & ucs4str);
+ucs4_to_ucs2(std::vector<lyx::char_type> const & ucs4str);
 
 std::vector<unsigned short>
-ucs4_to_ucs2(boost::uint32_t const * s, size_t ls);
+ucs4_to_ucs2(lyx::char_type const * s, size_t ls);
 
 unsigned short
-ucs4_to_ucs2(boost::uint32_t c);
+ucs4_to_ucs2(lyx::char_type c);
 
 std::vector<char>
-ucs4_to_utf8(std::vector<boost::uint32_t> const & ucs4str);
+ucs4_to_utf8(std::vector<lyx::char_type> const & ucs4str);
 
 std::vector<char>
-ucs4_to_utf8(boost::uint32_t c);
+ucs4_to_utf8(lyx::char_type c);
 
 #endif
Index: src/support/docstring.h
===================================================================
--- src/support/docstring.h	(Revision 14882)
+++ src/support/docstring.h	(Arbeitskopie)
@@ -18,8 +18,19 @@
 
 namespace lyx {
 
+/// The type used to hold characters in paragraphs
+#if defined(HAVE_WCHAR_T) && SIZEOF_WCHAR_T == 4
+// Prefer this if possible because GNU libstdc++ has usable std::ctype<wchar_t>
+// locale facets but not std::ctype<boost::uint32_t>. gcc older than 3.4 is
+// also missing usable std::char_traits<boost::uint32_t>.
+typedef wchar_t char_type;
+#else
+// This works on msvc
+typedef boost::uint32_t char_type;
+#endif
+
 /// String type for storing the main text in UCS4 encoding
-typedef std::basic_string<boost::uint32_t> docstring;
+typedef std::basic_string<lyx::char_type> docstring;
 
 /// Creates a docstring from a C string of ASCII characters
 docstring const from_ascii(char const *);
@@ -47,7 +58,7 @@ inline bool operator!=(lyx::docstring co
 /// Compare a C string of ASCII characters with a docstring
 inline bool operator!=(char const * l, lyx::docstring const & r) { return !(r == l); }
 
-#if defined(__GNUC__) && defined(__GNUC_MINOR__) && __GNUC__ == 3 && __GNUC_MINOR__ < 4
+#if 0 && defined(__GNUC__) && defined(__GNUC_MINOR__) && __GNUC__ == 3 && __GNUC_MINOR__ < 4
 // Missing char_traits methods in gcc 3.3 and older. Taken from gcc 4.2svn.
 namespace std {
 
Index: configure.ac
===================================================================
--- configure.ac	(Revision 14882)
+++ configure.ac	(Arbeitskopie)
@@ -144,6 +144,9 @@ AC_SUBST(AIKSAURUS_LIBS)
 
 LYX_USE_INCLUDED_BOOST
 
+# Needed for our char_type
+AC_CHECK_SIZEOF(wchar_t)
+
 ### Setup libtool
 dnl Dirty trick ahead: disable libtool checking for a fortran compiler
 dnl see http://permalink.gmane.org/gmane.comp.gnu.libtool.general/6699
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <exception>
#include <fstream>
#include <iostream>
#include <locale>

#include <iconv.h>

#define ICONV_CONST

// Local stand-in for boost::uint32_t so that this test program does not
// include any Boost header.
// NOTE(review): assumes unsigned int is exactly 32 bits wide on the target
// platform -- confirm before relying on it.
namespace boost {
	typedef unsigned int uint32_t;
}


// Missing codecvt<boost::uint32_t, char, mbstate_t> for gcc. Does not work.
//
// Attempt to supply the explicit specialization of std::codecvt for
// boost::uint32_t. It is only compiled for GNU compilers and derives from
// the __codecvt_abstract_base name that the code expects the standard
// library to provide. The conversion functions are stubs: do_in() and
// do_out() always report an error.
#ifdef __GNUC__
namespace std {
template<>
class codecvt<boost::uint32_t, char, mbstate_t>
	: public __codecvt_abstract_base<boost::uint32_t, char, mbstate_t>
{
public:
	// The three character/state types of the facet interface.
	typedef boost::uint32_t           intern_type;
	typedef char                      extern_type;
	typedef mbstate_t                 state_type;

protected:
	// The C locale handle member is left out for gcc 3.x older than 3.4.
#if __GNUC__ != 3 || __GNUC_MINOR__ >= 4
	__c_locale                        _M_c_locale_codecvt;
#endif
public:
	/// Facet identification, defined after the class.
	static locale::id                 id;
	// The constructor only initializes the C locale handle where the
	// member exists (see above).
#if __GNUC__ == 3 && __GNUC_MINOR__ < 4
	explicit codecvt(size_t refs = 0) : __codecvt_abstract_base<boost::uint32_t, char, mbstate_t>(refs) {}
#else
	explicit codecvt(size_t refs = 0) : __codecvt_abstract_base<boost::uint32_t, char, mbstate_t>(refs), _M_c_locale_codecvt(_S_get_c_locale()) {}
#endif
protected:
	// The destructor releases the C locale handle where the member exists.
#if __GNUC__ == 3 && __GNUC_MINOR__ < 4
	virtual ~codecvt() {}
#else
	virtual ~codecvt() { _S_destroy_c_locale(_M_c_locale_codecvt); }
#endif
	/// Stub: internal -> external conversion is not implemented, always
	/// returns error.
	virtual result do_out(state_type &, const intern_type *, const intern_type *, const intern_type *&,
			extern_type *, extern_type *, extern_type *&) const
	{
		return error;
	}
	/// Writes nothing: sets to_next to the start of the output range and
	/// returns noconv.
	virtual result do_unshift(state_type &, extern_type * to, extern_type *, extern_type *& to_next) const
	{
		to_next = to;
		return noconv;
	}
	/// Stub: external -> internal conversion is not implemented, always
	/// returns error.
	virtual result do_in(state_type &, const extern_type *, const extern_type *, const extern_type *&,
			intern_type*, intern_type*, intern_type*&) const
	{
		return error;
	}
	// Fixed answers for the remaining facet queries.
	virtual int do_encoding() const throw() { return 0; }
	virtual bool do_always_noconv() const throw() {return false; }
	virtual int do_length(state_type &, const extern_type *, const extern_type *, size_t) const { return 1; }
	virtual int do_max_length() const throw() { return 4; }
};

// Definition of the static facet id declared in the class above.
locale::id codecvt<boost::uint32_t, char, mbstate_t>::id;

}
#endif

// Choose the character type.
// lyx::char_type is the internal character type used by the facet and the
// test driver below. Exactly one of the two typedefs must be active; the
// commented-out line is the boost::uint32_t alternative.
namespace lyx {
//	typedef boost::uint32_t char_type;
	typedef wchar_t char_type;
}


// codecvt_facet for conversion of lyx::char_type (internal representation) to UTF8 (external representation)
class utf8_codecvt_facet : public std::codecvt<lyx::char_type, char, std::mbstate_t> {
	typedef std::codecvt<lyx::char_type, char, std::mbstate_t> base;
public:
	explicit utf8_codecvt_facet(size_t refs = 0);
protected:
	virtual ~utf8_codecvt_facet();
	virtual result do_out(state_type &, intern_type const * from,
			intern_type const * from_end, intern_type const *& from_next,
			extern_type * to, extern_type * to_end,
			extern_type *& to_next) const;
	virtual result do_unshift(state_type &, extern_type * to, extern_type *, extern_type *& to_next) const;
	virtual result do_in(state_type &,
			extern_type const * from, extern_type const * from_end,
			extern_type const *& from_next,
			intern_type * to, intern_type * to_end,
			intern_type *& to_next) const;
	virtual int do_encoding();
	virtual bool do_always_noconv();
	virtual int do_length(state_type & state, extern_type const * from, extern_type const * end, size_t max) const;
	virtual int do_max_length() const throw();
private:
	inline base::result do_iconv(iconv_t cd, char const ** from, size_t * inbytesleft, char ** to, size_t * outbytesleft) const
	{
		char const * to_start = *to;
		size_t converted = iconv(cd, const_cast<char ICONV_CONST **>(from), inbytesleft, to, outbytesleft);
		if (converted == (size_t)(-1)) {
			fprintf(stderr, "Error %d returned from iconv: %s\n", errno, strerror(errno));
			switch(errno) {
				case EINVAL:
				case E2BIG:
					fprintf(stderr, "partial result. inbytesleft: %d outbytesleft: %d\n", *inbytesleft, *outbytesleft);
					fflush(stderr);
					return base::partial;
				case EILSEQ:
				default:
					fprintf(stderr, "error result. inbytesleft: %d outbytesleft: %d\n", *inbytesleft, *outbytesleft);
					fflush(stderr);
					return base::error;
			}
		}
		if (*to == to_start)
			return base::noconv;
		return base::ok;
	}
	mutable iconv_t in_cd_;
	mutable iconv_t out_cd_;
};

utf8_codecvt_facet::utf8_codecvt_facet(size_t refs)
	: base(refs), in_cd_((iconv_t)(-1)), out_cd_((iconv_t)(-1))
{
}
utf8_codecvt_facet::~utf8_codecvt_facet()
{
	if (in_cd_ != (iconv_t)(-1))
		if (iconv_close(in_cd_) == -1) {
			fprintf(stderr, "Error %d returned from iconv_close(in_cd_): %s\n", errno, strerror(errno));
			fflush(stderr);
		}
	if (out_cd_ != (iconv_t)(-1))
		if (iconv_close(out_cd_) == -1) {
			fprintf(stderr, "Error %d returned from iconv_close(out_cd_): %s\n", errno, strerror(errno));
			fflush(stderr);
		}

}
utf8_codecvt_facet::result utf8_codecvt_facet::do_out(state_type &, intern_type const * from,
		intern_type const * from_end, intern_type const *& from_next,
		extern_type * to, extern_type * to_end,
		extern_type *& to_next) const
{
	if (out_cd_ == (iconv_t)(-1)) {
		out_cd_ = iconv_open("UTF-8", "UCS-4LE");
		if (out_cd_ == (iconv_t)(-1)) {
			fprintf(stderr, "Error %d returned from iconv_open(out_cd_): %s\n", errno, strerror(errno));
			fflush(stderr);
			throw std::exception();
		}
	}
	size_t inbytesleft = (from_end - from) * sizeof(intern_type);
	size_t outbytesleft = (to_end - to) * sizeof(extern_type);
	from_next = from;
	to_next = to;
	return do_iconv(out_cd_, reinterpret_cast<char const **>(&from_next), &inbytesleft, &to_next, &outbytesleft);
}
utf8_codecvt_facet::result utf8_codecvt_facet::do_unshift(state_type &, extern_type * to, extern_type *, extern_type *& to_next) const
{
	// utf8 does not use shifting
	to_next = to;
	return base::noconv;
}
utf8_codecvt_facet::result utf8_codecvt_facet::do_in(state_type &,
		extern_type const * from, extern_type const * from_end,
		extern_type const *& from_next,
		intern_type * to, intern_type * to_end,
		intern_type *& to_next) const
{
	if (in_cd_ == (iconv_t)(-1)) {
		in_cd_ = iconv_open("UCS-4", "UTF-8");
		if (in_cd_ == (iconv_t)(-1)) {
			fprintf(stderr, "Error %d returned from iconv_open(in_cd_): %s\n", errno, strerror(errno));
			fflush(stderr);
			throw std::exception();
		}
	}
	size_t inbytesleft = (from_end - from) * sizeof(extern_type);
	size_t outbytesleft = (to_end - to) * sizeof(intern_type);
	from_next = from;
	to_next = to;
	return do_iconv(in_cd_, &from_next, &inbytesleft, reinterpret_cast<char **>(&to_next), &outbytesleft);
}
int utf8_codecvt_facet::do_encoding()
{
	return 0;
}
bool utf8_codecvt_facet::do_always_noconv()
{
	return false;
}
int utf8_codecvt_facet::do_length(state_type & /*state*/, extern_type const * from, extern_type const * end, size_t max) const
{
#if 0
	intern_type * to = new intern_type[max];
	intern_type * to_end = to + max;
	intern_type * to_next = to;
	extern_type const * from_next = from;
	do_in(state, from, end, from_next, to, to_end, to_next);
	delete[] to;
	return to_next - to;
#endif
	size_t const length = end - from;
	return std::min(length, max);
}
int utf8_codecvt_facet::do_max_length() const throw()
{
	// UTF8 uses at most 6 bytes to represent one code point
	return 6;
}


int main()
{
	// Without this initial output to cerr the umlauts sent to wcerr below are printed as ?. Why?
	std::cerr << "test:" << std::endl;
	std::locale const utf8(std::locale("C"), new utf8_codecvt_facet);
//	std::cerr << "utf8 has std::codecvt<lyx::char_type, char, std::mbstate_t> facet: " << std::has_facet<std::codecvt<lyx::char_type, char, std::mbstate_t> >(utf8) << std::endl;
//	std::cerr << "utf8 has utf8_codecvt_facet facet: " << std::has_facet<utf8_codecvt_facet>(utf8) << std::endl;
	std::basic_ofstream<lyx::char_type> os;
	os.imbue(utf8);
	os.open("stream.out");
	os << L" abc" << lyx::char_type('d') << L'\n';
//	std::wcerr << L" abc" << lyx::char_type('d') << L'\n';
	std::wcerr << L" abcd\n";
	lyx::char_type ae = 0xe4;
	lyx::char_type oe = 0xf6;
	lyx::char_type ue = 0xfc;
	lyx::char_type Ae = 0xc4;
	lyx::char_type Oe = 0xd6;
	lyx::char_type Ue = 0xdc;
	os << L' ' << ae << oe << ue << L'\n';
	std::wcerr << L' ' << ae << oe << ue << L'\n';
	std::cerr << std::hex << L' ' << ' ' << ae << ' ' << oe << ' ' << ue << '\n';
	os << L' ' << Ae << Oe << Ue << L'\n';
	std::wcerr << L' ' << Ae << Oe << Ue << L'\n';
	std::cerr << std::hex << L' ' << ' ' << Ae << ' ' << Oe << ' ' << Ue << '\n';
	os << L" abcd\n";
	std::wcerr << L" abcd\n";
	return 0;
}

Reply via email to