Hello,
This patch brings a 20% improvement in the time needed to load a big
document (my test document is the UserGuide copied and pasted 3 times).
This is done by avoiding multiple string/vector/docstring copies.
I can think of many other optimisations, but I would like this one to go
in first. IMHO, most of the unicode code can go, and only one
iconv_convert() should stay.
Objections?
Abdel.
Index: lyxlex_pimpl.C
===================================================================
--- lyxlex_pimpl.C (revision 15581)
+++ lyxlex_pimpl.C (working copy)
@@ -70,15 +70,13 @@
string const LyXLex::Pimpl::getString() const
{
- return string(buff.begin(), buff.end());
+ return buff;
}
docstring const LyXLex::Pimpl::getDocString() const
{
- std::vector<char_type> res = utf8_to_ucs4(buff);
- docstring dstr(res.begin(), res.end());
- return dstr;
+ return from_utf8(buff);
}
@@ -206,13 +204,12 @@
// we extract the first word and leaves the rest
// in pushTok. (Lgb)
if (pushTok.find(' ') != string::npos && pushTok[0] == '\\') {
- string tmp;
- pushTok = split(pushTok, tmp, ' ');
- buff.assign(tmp.begin(), tmp.end());
+ buff.clear();
+ pushTok = split(pushTok, buff, ' ');
return true;
} else {
- buff.assign(pushTok.begin(), pushTok.end());
- pushTok.erase();
+ buff = pushTok;
+ pushTok.clear();
return true;
}
}
@@ -256,7 +253,7 @@
++lineno;
}
- buff.pop_back();
+ buff.resize(buff.size()-1);
status = LEX_DATA;
break;
}
@@ -377,7 +374,7 @@
++lineno;
}
- buff.pop_back();
+ buff.resize(buff.size() -1);
status = LEX_DATA;
break;
}
@@ -456,7 +453,7 @@
if (c == '\n') {
++lineno;
- buff.pop_back();
+ buff.resize(buff.size() - 1);
status = LEX_DATA;
return true;
} else {
@@ -472,13 +469,12 @@
// we extract the first word and leaves the rest
// in pushTok. (Lgb)
if (pushTok.find(' ') != string::npos && pushTok[0] == '\\') {
- string tmp;
- pushTok = split(pushTok, tmp, ' ');
- buff.assign(tmp.begin(), tmp.end());
+ buff.clear();
+ pushTok = split(pushTok, buff, ' ');
return true;
} else {
- buff.assign(pushTok.begin(), pushTok.end());
- pushTok.erase();
+ buff = pushTok;
+ pushTok.clear();
return true;
}
}
Index: lyxlex_pimpl.h
===================================================================
--- lyxlex_pimpl.h (revision 15581)
+++ lyxlex_pimpl.h (working copy)
@@ -81,7 +81,7 @@
///
int no_items;
///
- std::vector<char> buff;
+ std::string buff;
///
int status;
///
Index: support/docstring.C
===================================================================
--- support/docstring.C (revision 15581)
+++ support/docstring.C (working copy)
@@ -20,6 +20,15 @@
namespace lyx {
+extern int iconv_convert(int & cd,
+ char const * tocode,
+ char const * fromcode,
+ char const * buf,
+ size_t buflen,
+ char * outbuf,
+ size_t maxoutsize);
+
+
docstring const from_ascii(char const * ascii)
{
docstring s;
@@ -53,11 +62,28 @@
}
+/// Convert the UTF-8 string \p utf8 into \p ucs4; any previous content of \p ucs4 is discarded.
+void utf8_to_ucs4(std::string const & utf8, docstring & ucs4)
+{
+ size_t const n = utf8.size();
+ ucs4.resize(n); // worst case: one output character per input byte
+ if (n == 0) return;
+ // Kept static, like the other converters: iconv_convert() opens the descriptor when it
+ // is -1 and only closes it on error, so a plain local would leak one per call.
+ static int cd = -1;
+ size_t const maxoutsize = n * sizeof(ucs4[0]);
+ char * outbuf = (char *)(&(ucs4[0]));
+ int bytes = iconv_convert(cd, ucs4_codeset, "UTF-8",
+ utf8.c_str(), n, outbuf, maxoutsize);
+ ucs4.resize(bytes / sizeof(ucs4[0])); // bytes written -> number of characters
+}
+
+
docstring const from_utf8(std::string const & utf8)
{
- std::vector<lyx::char_type> const ucs4 =
- utf8_to_ucs4(utf8.data(), utf8.size());
- return docstring(ucs4.begin(), ucs4.end());
+ docstring ucs4;
+ utf8_to_ucs4(utf8, ucs4);
+ return ucs4;
}
Index: support/unicode.C
===================================================================
--- support/unicode.C (revision 15581)
+++ support/unicode.C (working copy)
@@ -35,22 +35,20 @@
char const * ucs2_codeset = "UCS-2LE";
#endif
-namespace {
-
-template<typename RetType, typename InType>
-std::vector<RetType>
-iconv_convert(iconv_t * cd,
+int iconv_convert(int & cd,
char const * tocode,
char const * fromcode,
- InType const * buf,
- size_t buflen)
+ char const * buf,
+ size_t buflen,
+ char * outbuf,
+ size_t maxoutsize)
{
if (buflen == 0)
- return std::vector<RetType>();
+ return 0;
- if (*cd == (iconv_t)(-1)) {
- *cd = iconv_open(tocode, fromcode);
- if (*cd == (iconv_t)(-1)) {
+ if (cd == -1) {
+ cd = (int)(iconv_open(tocode, fromcode));
+ if (cd == -1) {
lyxerr << "Error returned from iconv_open" << endl;
switch (errno) {
case EINVAL:
@@ -66,17 +64,13 @@
}
}
- char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(reinterpret_cast<char const *>(buf));
- size_t inbytesleft = buflen * sizeof(InType);
- // The preamble of the user guide is more than 11.500 characters, so we go for 32kb
- size_t const outsize = 32768;
- static char out[outsize];
- char * outbuf = out;
- size_t outbytesleft = outsize;
+ char ICONV_CONST * inbuf = const_cast<char ICONV_CONST *>(buf);
+ size_t inbytesleft = buflen;
+ size_t outbytesleft = maxoutsize;
- size_t res = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+ int res = iconv((iconv_t)(cd), &inbuf, &inbytesleft, &outbuf, &outbytesleft);
- if (res == (size_t)(-1)) {
+ if (res == -1) {
lyxerr << "Error returned from iconv" << endl;
switch (errno) {
case E2BIG:
@@ -111,18 +105,44 @@
break;
}
// We got an error so we close down the conversion engine
- if (iconv_close(*cd) == -1) {
+ if (iconv_close((iconv_t)(cd)) == -1) {
lyxerr << "Error returned from iconv_close("
<< errno << ")" << endl;
}
- *cd = (iconv_t)(-1);
+ cd = -1;
}
//lyxerr << std::dec;
//lyxerr << "Inbytesleft: " << inbytesleft << endl;
//lyxerr << "Outbytesleft: " << outbytesleft << endl;
- int bytes = outsize - outbytesleft;
+ return maxoutsize - outbytesleft;
+}
+
+
+namespace {
+
+
+template<typename RetType, typename InType>
+std::vector<RetType>
+iconv_convert(int & cd,
+ char const * tocode,
+ char const * fromcode,
+ InType const * buf,
+ size_t buflen)
+{
+ if (buflen == 0)
+ return std::vector<RetType>();
+
+ char const * inbuf = reinterpret_cast<char const *>(buf);
+ size_t inbytesleft = buflen * sizeof(InType);
+
+ size_t const outsize = 32768;
+ static char out[outsize];
+ char * outbuf = out;
+
+ int bytes = lyx::iconv_convert(cd, tocode, fromcode, inbuf, inbytesleft, outbuf, outsize);
+
RetType const * tmp = reinterpret_cast<RetType const *>(out);
return std::vector<RetType>(tmp, tmp + bytes / sizeof(RetType));
}
@@ -142,8 +162,8 @@
std::vector<lyx::char_type>
utf8_to_ucs4(char const * utf8str, size_t ls)
{
- static iconv_t cd = (iconv_t)(-1);
- return iconv_convert<lyx::char_type>(&cd, ucs4_codeset, "UTF-8",
+ static int cd = -1;
+ return iconv_convert<lyx::char_type>(cd, ucs4_codeset, "UTF-8",
utf8str, ls);
}
@@ -168,8 +188,8 @@
std::vector<lyx::char_type>
ucs2_to_ucs4(unsigned short const * ucs2str, size_t ls)
{
- static iconv_t cd = (iconv_t)(-1);
- return iconv_convert<lyx::char_type>(&cd, ucs4_codeset, ucs2_codeset,
+ static int cd = -1;
+ return iconv_convert<lyx::char_type>(cd, ucs4_codeset, ucs2_codeset,
ucs2str, ls);
}
@@ -194,8 +214,8 @@
std::vector<unsigned short>
ucs4_to_ucs2(lyx::char_type const * s, size_t ls)
{
- static iconv_t cd = (iconv_t)(-1);
- return iconv_convert<unsigned short>(&cd, ucs2_codeset, ucs4_codeset,
+ static int cd = -1;
+ return iconv_convert<unsigned short>(cd, ucs2_codeset, ucs4_codeset,
s, ls);
}
@@ -203,8 +223,8 @@
std::vector<char>
ucs4_to_utf8(lyx::char_type c)
{
- static iconv_t cd = (iconv_t)(-1);
- return iconv_convert<char>(&cd, "UTF-8", ucs4_codeset, &c, 1);
+ static int cd = -1;
+ return iconv_convert<char>(cd, "UTF-8", ucs4_codeset, &c, 1);
}
@@ -221,8 +241,8 @@
std::vector<char>
ucs4_to_utf8(lyx::char_type const * ucs4str, size_t ls)
{
- static iconv_t cd = (iconv_t)(-1);
- return iconv_convert<char>(&cd, "UTF-8", ucs4_codeset,
+ static int cd = -1;
+ return iconv_convert<char>(cd, "UTF-8", ucs4_codeset,
ucs4str, ls);
}
@@ -230,10 +250,10 @@
std::vector<lyx::char_type>
eightbit_to_ucs4(char const * s, size_t ls, std::string const & encoding)
{
- static std::map<std::string, iconv_t> cd;
+ static std::map<std::string, int> cd;
if (cd.find(encoding) == cd.end())
- cd[encoding] = (iconv_t)(-1);
- return iconv_convert<char_type>(&cd[encoding], ucs4_codeset,
+ cd[encoding] = -1;
+ return iconv_convert<char_type>(cd[encoding], ucs4_codeset,
encoding.c_str(), s, ls);
}
@@ -241,10 +261,10 @@
std::vector<char>
ucs4_to_eightbit(lyx::char_type const * ucs4str, size_t ls, std::string const & encoding)
{
- static std::map<std::string, iconv_t> cd;
+ static std::map<std::string, int> cd;
if (cd.find(encoding) == cd.end())
- cd[encoding] = (iconv_t)(-1);
- return iconv_convert<char>(&cd[encoding], encoding.c_str(),
+ cd[encoding] = -1;
+ return iconv_convert<char>(cd[encoding], encoding.c_str(),
ucs4_codeset, ucs4str, ls);
}