[patch] wide streams - finally working

Georg Baum Thu, 14 Sep 2006 02:46:53 -0700

With these two patches I am able to export de_Userguide.lyx to UTF-8 plain
text without problems. The interesting part is in x1.diff: it defines wide
file and string streams. The file stream converts to UTF-8 on the fly, and
should also be used for DocBook later. x2.diff is mostly boring: it adds some
new UTF-8 conversions, but also removes some.
It works on Linux, and according to the tests done by Peter and Abdel it
should also work with MSVC.


Concerning Cygwin: at least a more complete ctype facet (with do_is etc.) is
needed there, and also a num_put facet. More locale facets might be needed
once these are implemented and working.
The ctype facet I posted yesterday could serve as a starting point, but the
wstring functions from the C library that it uses would need to be replaced.

Can this go in?


Georg

Index: src/support/os_win32.C
===================================================================
--- src/support/os_win32.C	(Revision 14992)
+++ src/support/os_win32.C	(Arbeitskopie)
@@ -24,6 +24,7 @@
 #include <boost/assert.hpp>
 
 #include <cstdlib>
+#include <locale>
 #include <vector>
 
 #include <string>
@@ -80,10 +81,65 @@ bool windows_style_tex_paths_ = true;
 
 string cygdrive = "/cygdrive";
 
+/// ctype facet for UCS4 streams. Widening and narrowing are restricted to
+/// ASCII (all we need); any other character makes them throw std::bad_cast.
+class ascii_ctype_facet : public std::ctype<lyx::char_type>
+{
+public:
+	typedef lyx::char_type char_type;
+public:
+	explicit ascii_ctype_facet(size_t refs = 0) : std::ctype<char_type>(refs) {}
+protected:
+	virtual ~ascii_ctype_facet() {}
+	virtual char_type do_widen(char c) const
+	{
+		if (static_cast<unsigned char>(c) < 128)
+			return c;
+		throw std::bad_cast();
+	}
+	virtual const char* do_widen(const char* lo, const char* hi,
+			char_type* dest) const
+	{
+		while (lo < hi) {
+			if (static_cast<unsigned char>(*lo) >= 128)
+				throw std::bad_cast();
+			*dest = *lo;
+			++lo;
+			++dest;
+		}
+		return hi;
+	}
+	virtual char do_narrow(char_type wc, char) const
+	{
+		if (wc < 128)
+			return static_cast<char>(wc);
+		throw std::bad_cast();
+	}
+	virtual char_type const * do_narrow(char_type const * lo,
+			char_type const * hi, char, char * dest) const
+	{
+		while (lo < hi) {
+			if (*lo < 128)
+				*dest = static_cast<char>(*lo);
+			else
+				throw std::bad_cast();
+			++lo;
+			++dest;
+		}
+		return hi;
+	}
+};
+
 } // namespace anon
 
 void init(int /* argc */, char * argv[])
 {
+	// Set a usable ctype facet for our docstreams. Without this we
+	// are not able to output narrow characters.
+	std::locale global;
+	std::locale locale(global, new ascii_ctype_facet);
+	std::locale::global(locale);
+
 	/* Note from Angus, 17 Jan 2005:
 	 *
 	 * The code below is taken verbatim from Ruurd's original patch
Index: src/support/docstream.C
===================================================================
--- src/support/docstream.C	(Revision 0)
+++ src/support/docstream.C	(Revision 0)
@@ -0,0 +1,232 @@
+/**
+ * \file docstream.C
+ * This file is part of LyX, the document processor.
+ * Licence details can be found in the file COPYING.
+ *
+ * \author Georg Baum
+ *
+ * Full author contact details are available in file CREDITS.
+ */
+
+#include <config.h>
+
+#include "docstream.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <exception>
+#include <iconv.h>
+#include <locale>
+
+namespace {
+
+#ifdef WORDS_BIGENDIAN
+char const * ucs4_codeset = "UCS-4BE";
+#else
+char const * ucs4_codeset = "UCS-4LE";
+#endif
+char const * utf8_codeset = "UTF-8";
+
+// We use C IO throughout this file, because the facets might be used with
+// lyxerr in the future.
+
+/// codecvt facet for conversion of UCS4 (internal representation) to UTF8
+/// (external representation) or vice versa
+class utf8_codecvt_facet : public std::codecvt<lyx::char_type, char, std::mbstate_t>
+{
+	typedef std::codecvt<lyx::char_type, char, std::mbstate_t> base;
+public:
+	/// Constructor. You have to specify with \p inout whether you want
+	/// to use this facet only for input, only for output or for both.
+	/// Throws std::exception if a requested iconv descriptor cannot be
+	/// opened.
+	explicit utf8_codecvt_facet(std::ios_base::openmode inout = std::ios_base::in | std::ios_base::out,
+			size_t refs = 0)
+		: base(refs), in_cd_((iconv_t)(-1)), out_cd_((iconv_t)(-1))
+	{
+		if (inout & std::ios_base::in) {
+			in_cd_ = iconv_open(ucs4_codeset, utf8_codeset);
+			if (in_cd_ == (iconv_t)(-1)) {
+				fprintf(stderr, "Error %d returned from iconv_open(in_cd_): %s\n",
+				        errno, strerror(errno));
+				fflush(stderr);
+				throw std::exception();
+			}
+		}
+		if (inout & std::ios_base::out) {
+			out_cd_ = iconv_open(utf8_codeset, ucs4_codeset);
+			if (out_cd_ == (iconv_t)(-1)) {
+				fprintf(stderr, "Error %d returned from iconv_open(out_cd_): %s\n",
+				        errno, strerror(errno));
+				fflush(stderr);
+				// The destructor does not run when the constructor
+				// throws, so an input descriptor that was already
+				// opened must be closed here or it would leak.
+				if (in_cd_ != (iconv_t)(-1))
+					iconv_close(in_cd_);
+				throw std::exception();
+			}
+		}
+	}
+protected:
+	/// Closes the iconv descriptors that were opened by the constructor.
+	/// A failing iconv_close() is only reported on stderr.
+	virtual ~utf8_codecvt_facet()
+	{
+		if (in_cd_ != (iconv_t)(-1))
+			if (iconv_close(in_cd_) == -1) {
+				fprintf(stderr, "Error %d returned from iconv_close(in_cd_): %s\n",
+				        errno, strerror(errno));
+				fflush(stderr);
+			}
+		if (out_cd_ != (iconv_t)(-1))
+			if (iconv_close(out_cd_) == -1) {
+				fprintf(stderr, "Error %d returned from iconv_close(out_cd_): %s\n",
+				        errno, strerror(errno));
+				fflush(stderr);
+			}
+	}
+	/// Convert UCS4 from [from, from_end) to UTF8 in [to, to_end).
+	virtual result do_out(state_type &, intern_type const * from,
+			intern_type const * from_end, intern_type const *& from_next,
+			extern_type * to, extern_type * to_end,
+			extern_type *& to_next) const
+	{
+		size_t inbytesleft = (from_end - from) * sizeof(intern_type);
+		size_t outbytesleft = (to_end - to) * sizeof(extern_type);
+		from_next = from;
+		to_next = to;
+		return do_iconv(out_cd_, reinterpret_cast<char const **>(&from_next),
+				&inbytesleft, &to_next, &outbytesleft);
+	}
+	virtual result do_unshift(state_type &, extern_type * to,
+			extern_type *, extern_type *& to_next) const
+	{
+		// utf8 does not use shifting
+		to_next = to;
+		return base::noconv;
+	}
+	/// Convert UTF8 from [from, from_end) to UCS4 in [to, to_end).
+	virtual result do_in(state_type &,
+			extern_type const * from, extern_type const * from_end,
+			extern_type const *& from_next,
+			intern_type * to, intern_type * to_end,
+			intern_type *& to_next) const
+	{
+		size_t inbytesleft = (from_end - from) * sizeof(extern_type);
+		size_t outbytesleft = (to_end - to) * sizeof(intern_type);
+		from_next = from;
+		to_next = to;
+		return do_iconv(in_cd_, &from_next, &inbytesleft,
+				reinterpret_cast<char **>(&to_next),
+				&outbytesleft);
+	}
+	virtual int do_encoding() const throw()
+	{
+		return 0;
+	}
+	virtual bool do_always_noconv() const throw()
+	{
+		return false;
+	}
+	virtual int do_length(state_type & /*state*/, extern_type const * from,
+			extern_type const * end, size_t max) const
+	{
+#if 0
+		// It seems we should do this:
+		intern_type * to = new intern_type[max];
+		intern_type * to_end = to + max;
+		intern_type * to_next = to;
+		extern_type const * from_next = from;
+		do_in(state, from, end, from_next, to, to_end, to_next);
+		delete[] to;
+		return to_next - to;
+#endif
+		// But since that is expensive we are lazy:
+		size_t const length = end - from;
+		return std::min(length, max);
+	}
+	virtual int do_max_length() const throw()
+	{
+		// UTF8 uses at most 6 bytes to represent one code point
+		return 6;
+	}
+private:
+	/// Do the actual conversion. The interface is equivalent to that of
+	/// iconv() (but const correct).
+	/// Returns partial if iconv() stopped with EINVAL or E2BIG, error
+	/// (after reporting on stderr) for any other failure, noconv if
+	/// nothing was written to \p to, and ok otherwise.
+	inline base::result do_iconv(iconv_t cd, char const ** from,
+			size_t * inbytesleft, char ** to, size_t * outbytesleft) const
+	{
+		char const * to_start = *to;
+		size_t converted = iconv(cd, const_cast<char ICONV_CONST **>(from),
+				inbytesleft, to, outbytesleft);
+		if (converted == (size_t)(-1)) {
+			switch(errno) {
+			case EINVAL:
+			case E2BIG:
+				return base::partial;
+			case EILSEQ:
+			default:
+				fprintf(stderr, "Error %d returned from iconv: %s\n",
+				        errno, strerror(errno));
+				fflush(stderr);
+				return base::error;
+			}
+		}
+		if (*to == to_start)
+			return base::noconv;
+		return base::ok;
+	}
+	iconv_t in_cd_;
+	iconv_t out_cd_;
+};
+
+}
+
+
+namespace lyx {
+
+
+idocfstream::idocfstream() : base()
+{
+	std::locale global;
+	std::locale locale(global, new utf8_codecvt_facet(in));
+	imbue(locale);
+}
+
+
+idocfstream::idocfstream(const char* s, std::ios_base::openmode mode)
+	: base()
+{
+	// We must imbue the stream before opening the file
+	std::locale global;
+	std::locale locale(global, new utf8_codecvt_facet(in));
+	imbue(locale);
+	open(s, mode);
+}
+
+
+odocfstream::odocfstream() : base()
+{
+	std::locale global;
+	std::locale locale(global, new utf8_codecvt_facet(out));
+	imbue(locale);
+}
+
+
+odocfstream::odocfstream(const char* s, std::ios_base::openmode mode)
+	: base()
+{
+	// We must imbue the stream before opening the file
+	std::locale global;
+	std::locale locale(global, new utf8_codecvt_facet(out));
+	imbue(locale);
+	open(s, mode);
+}
+
+}
Index: src/support/docstream.h
===================================================================
--- src/support/docstream.h	(Revision 0)
+++ src/support/docstream.h	(Revision 0)
@@ -0,0 +1,66 @@
+// -*- C++ -*-
+/**
+ * \file docstream.h
+ * This file is part of LyX, the document processor.
+ * Licence details can be found in the file COPYING.
+ *
+ * \author Georg Baum
+ *
+ * Full author contact details are available in file CREDITS.
+ */
+
+#ifndef LYX_DOCSTREAM_H
+#define LYX_DOCSTREAM_H
+
+#include "support/docstring.h"
+
+#include <fstream>
+#include <sstream>
+
+namespace lyx {
+
+/// Base class for UCS4 input streams
+typedef std::basic_istream<char_type> idocstream;
+
+/** Base class for UCS4 output streams.
+    If you want to output a single UCS4 character, use \code
+    os.put(c);
+    \endcode, not \code
+    os << c;
+    \endcode . The latter will not output the character, but the code point
+    as number. This is because we can't overload operator<< (our character
+    type is not a real type but a typedef). Narrow characters of type char
+    can be output as usual.
+ */
+typedef std::basic_ostream<char_type> odocstream;
+
+/// File stream for reading UTF8-encoded files. The constructors imbue a
+/// codecvt facet, so the contents are converted to UCS4 on the fly.
+class idocfstream : public std::basic_ifstream<char_type> {
+	typedef std::basic_ifstream<char_type> base;
+public:
+	idocfstream();
+	explicit idocfstream(const char* s,
+		std::ios_base::openmode mode = std::ios_base::in);
+	~idocfstream() {}
+};
+
+/// File stream for writing UTF8-encoded files. The constructors imbue a
+/// codecvt facet, so UCS4 output is converted to UTF8 on the fly.
+class odocfstream : public std::basic_ofstream<char_type> {
+	typedef std::basic_ofstream<char_type> base;
+public:
+	odocfstream();
+	explicit odocfstream(const char* s,
+		std::ios_base::openmode mode = std::ios_base::out|std::ios_base::trunc);
+	~odocfstream() {}
+};
+
+/// UCS4 input stringstream
+typedef std::basic_istringstream<char_type> idocstringstream;
+
+/// UCS4 output stringstream
+typedef std::basic_ostringstream<char_type> odocstringstream;
+
+}
+#endif
Index: src/support/Makefile.am
===================================================================
--- src/support/Makefile.am	(Revision 14992)
+++ src/support/Makefile.am	(Arbeitskopie)
@@ -27,6 +27,8 @@ libsupport_la_SOURCES = \
 	copied_ptr.h \
 	cow_ptr.h \
 	debugstream.h \
+	docstream.C \
+	docstream.h \
 	docstring.C \
 	docstring.h \
 	environment.h \
Index: development/scons/scons_manifest.py
===================================================================
--- development/scons/scons_manifest.py	(Revision 14992)
+++ development/scons/scons_manifest.py	(Arbeitskopie)
@@ -96,6 +96,7 @@ src_support_header_files = Split('''
     copied_ptr.h
     cow_ptr.h
     debugstream.h
+    docstream.h
     docstring.h
     environment.h
     filefilterlist.h
@@ -133,6 +134,7 @@ src_support_files = Split('''
     chdir.C
     convert.C
     copy.C
+    docstream.C
     docstring.C
     environment.C
     filefilterlist.C

x2.diff.bz2
Description: BZip2 compressed data

[patch] wide streams - finally working

Reply via email to