Re: [Patch] optimize ucs4 to local conversion

Georg Baum Tue, 15 May 2007 07:29:09 -0700

Abdelrazak Younes wrote:

> Abdelrazak Younes wrote:
>> If you try out the document attached in bug 3561
>> (http://bugzilla.lyx.org/show_bug.cgi?id=3561) and View->Source, the
>> Encoding::init() will take 40 seconds on my system.
>> 
>> With the attached patch, this goes down to 25 seconds.


Your patch (+ the part that is already in) has several problems:

- char ucs4_to_eightbit(char_type ucs4, string const & encoding) does not
make sense at all. You cannot guarantee that the result is only one char
(besides, it is unused).

- The name of ucs4_to_multibytes is misleading: this function does exactly
the same as ucs4_to_eightbit, only optimized for a single UCS4 char.

- Now there are two maps with ucs4 -> 8bit iconv processors; one is enough
and more efficient.

- ucs4_to_multibytes silently fails for exotic conversions that result in
more than 4 bytes. AFAIK LyX currently doesn't support such an encoding,
but AFAIK some exist and they could be supported in the future.

- There is no reason not to use the optimized map lookup in
eightbit_to_ucs4, too.

Consider the attached (untested) version if you like.


Georg

Index: src/support/unicode.cpp
===================================================================
--- src/support/unicode.cpp	(Revision 18336)
+++ src/support/unicode.cpp	(Arbeitskopie)
@@ -288,59 +288,54 @@ vector<char_type>
 eightbit_to_ucs4(char const * s, size_t ls, string const & encoding)
 {
 	static map<string, IconvProcessor> processors;
-	if (processors.find(encoding) == processors.end()) {
+	map<string, IconvProcessor>::iterator it = processors.find(encoding);
+	if (it == processors.end()) {
 		IconvProcessor processor(ucs4_codeset, encoding.c_str());
-		processors.insert(make_pair(encoding, processor));
+		it = processors.insert(make_pair(encoding, processor)).first;
 	}
-	return iconv_convert<char_type>(processors[encoding], s, ls);
+	return iconv_convert<char_type>(it->second, s, ls);
 }
 
 
-vector<char>
-ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding)
+namespace {
+
+/// processors for UCS4 -> 8bit encoding conversions
+map<string, IconvProcessor> ucs4_processors;
+
+
+/// Get processor for UCS4 -> \p encoding conversion
+inline IconvProcessor & get_ucs4_processor(string const & encoding)
 {
-	static map<string, IconvProcessor> processors;
-	if (processors.find(encoding) == processors.end()) {
+	map<string, IconvProcessor>::iterator it = ucs4_processors.find(encoding);
+	if (it == ucs4_processors.end()) {
 		IconvProcessor processor(encoding.c_str(), ucs4_codeset);
-		processors.insert(make_pair(encoding, processor));
+		return ucs4_processors.insert(make_pair(encoding, processor)).first->second;
 	}
-	return iconv_convert<char>(processors[encoding], ucs4str, ls);
+	return it->second;
 }
 
+}
 
-char ucs4_to_eightbit(char_type ucs4, string const & encoding)
-{
-	static map<string, IconvProcessor> processors;
-	map<string, IconvProcessor>::iterator it = processors.find(encoding);
-	if (it == processors.end()) {
-		IconvProcessor processor(encoding.c_str(), ucs4_codeset);
-		it = processors.insert(make_pair(encoding, processor)).first;
-	}
 
-	char out;
-	int const bytes = it->second.convert((char *)(&ucs4), 4, &out, 1);
-	if (bytes > 0)
-		return out;
-	return 0;
+vector<char>
+ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding)
+{
+	return iconv_convert<char>(get_ucs4_processor(encoding), ucs4str, ls);
 }
 
 
-void ucs4_to_multibytes(char_type ucs4, vector<char> & out,
+void ucs4_to_eightbit(char_type ucs4, vector<char> & out,
 	string const & encoding)
 {
-	static map<string, IconvProcessor> processors;
-	map<string, IconvProcessor>::iterator it = processors.find(encoding);
-	if (it == processors.end()) {
-		IconvProcessor processor(encoding.c_str(), ucs4_codeset);
-		it = processors.insert(make_pair(encoding, processor)).first;
-	}
 
 	out.resize(4);
-	int bytes = it->second.convert((char *)(&ucs4), 4, &out[0], 4);
-	if (bytes > 0)
+	int bytes = get_ucs4_processor(encoding).convert((char *)(&ucs4), 4, &out[0], 4);
+	if (bytes >= 0)
 		out.resize(bytes);
 	else
-		out.clear();
+		// Use unoptimized version.
+		// Does only happen for exotic encodings
+		out = ucs4_to_eightbit(&ucs4, 1, encoding);
 }
 
 } // namespace lyx
Index: src/support/unicode.h
===================================================================
--- src/support/unicode.h	(Revision 18336)
+++ src/support/unicode.h	(Arbeitskopie)
@@ -89,12 +89,9 @@ eightbit_to_ucs4(char const * s, size_t 
 std::vector<char>
 ucs4_to_eightbit(char_type const * ucs4str, size_t ls, std::string const & encoding);
 
-/// convert ucs4 character \p c to encoding \p encoding.
+/// convert ucs4 character \p ucs4 to encoding \p encoding.
 /// \p encoding must be a valid iconv 8bit encoding
-char ucs4_to_eightbit(char_type c, std::string const & encoding);
-
-///
-void ucs4_to_multibytes(char_type ucs4, std::vector<char> & out,
+void ucs4_to_eightbit(char_type ucs4, std::vector<char> & out,
 	std::string const & encoding);
 
 extern char const * ucs4_codeset;
Index: src/Encoding.cpp
===================================================================
--- src/Encoding.cpp	(Revision 18336)
+++ src/Encoding.cpp	(Arbeitskopie)
@@ -171,8 +171,9 @@ void Encoding::init() const
 		// they do not have a direct representation as a single byte,
 		// therefore we need to check all UCS4 code points.
 		// This is expensive!
+		std::vector<char> eightbit;
 		for (char_type c = 0; c < max_ucs4; ++c) {
-			std::vector<char> const eightbit = ucs4_to_eightbit(&c, 1, iconvName_);
+			ucs4_to_eightbit(c, eightbit, iconvName_);
 			if (!eightbit.empty()) {
 				CharInfoMap::const_iterator const it = unicodesymbols.find(c);
 				if (it == unicodesymbols.end() || !it->second.force)

Re: [Patch] optimize ucs4 to local conversion

Reply via email to