[PATCH] Speed up parsing of documents

Jean-Marc Lasgouttes Wed, 27 Mar 2002 15:22:06 -0800

Here is a small experiment I did. It is known that parsing of
very large files is slowed down by the fact that we do a whole lot
of string compares instead of a good old binary search like in lyxlex.


Typically, when loading a big file from Herbert,
Buffer::parseSingleLyXformat2Token takes 25.6% of the time,
and a big part of that is due to 2114004 string comparisons.

It occurred to me that this could be alleviated by reordering the tokens
in the chained ifs. I whipped up the following simple script:

--counttokens----
#!/bin/sh

# Usage: counttokens FILE
# Print how many times each backslash-introduced token starts a line in
# FILE, most frequent token first.
#   grep  keeps only the lines that begin with a backslash;
#   sed   cuts each such line at its first space, leaving the leading
#         token (a line with no space is passed through unchanged);
#   sort | uniq -c  collapses identical tokens and prefixes each with
#         its occurrence count;
#   sort -k1,1nr    orders the result by that count, descending.
# NOTE: $1 is unquoted, so a file name containing whitespace is split
# into several arguments.
grep '^\\' $1 | sed -e 's/^\(\\[^ ]*\) .*$/\1/'|sort|uniq -c|sort -k1,1nr
--------

And running it on the book, I get
ginette: ./counttokens fichiers/Versuch.lyx
    7767 \layout
    6239 \begin_inset
    6239 \end_inset
    5138 \latex
    4841 \backslash
    2638 \emph
    2555 \family
    2540 \noun
    2537 \color
    2534 \bar
    2534 \shape
    2420 \size
    1733 \series
     277 \begin_float
     277 \end_float
[..]

Therefore I moved the fonts, insets and backslash (useful for ERT) tests
to the top of the method.

The result is that Buffer::parseSingleLyXformat2Token takes 16.2% of the
time, and the 404503 string comparisons now take less time than
InsertChar, which makes more sense.

I attach the patch instead of committing it, because I want to have
a virtual nod from Lars first. I would not want to break the carefully
crafted compatibility reading...

Note that this is tested with lyxstring. Results with normal strings
may be completely different...

JMarc

Index: src/ChangeLog
===================================================================
RCS file: /usr/local/lyx/cvsroot/lyx-devel/src/ChangeLog,v
retrieving revision 1.651
diff -u -r1.651 ChangeLog
--- src/ChangeLog       27 Mar 2002 23:27:12 -0000      1.651
+++ src/ChangeLog       27 Mar 2002 23:29:00 -0000
@@ -1,3 +1,9 @@
+2002-03-28  Jean-Marc Lasgouttes  <[EMAIL PROTECTED]>
+
+       * buffer.C (parseSingleLyXformat2Token): reorder a bit the tests
+       in order to reduce drastically the number of comparisons needed to
+       parse a large document
+
 2002-03-27  Jean-Marc Lasgouttes  <[EMAIL PROTECTED]>
 
        * lyxfunc.C (getStatus): return 'disabled' early for LFUN_NOACTION
Index: src/buffer.C
===================================================================
RCS file: /usr/local/lyx/cvsroot/lyx-devel/src/buffer.C,v
retrieving revision 1.322
diff -u -r1.322 buffer.C
--- src/buffer.C        25 Mar 2002 11:15:25 -0000      1.322
+++ src/buffer.C        27 Mar 2002 23:29:11 -0000
@@ -564,6 +564,102 @@
 #endif
 
 #ifndef NO_COMPABILITY
+       } else if (token == "\\end_inset") {
+               lyxerr << "Solitary \\end_inset. Missing \\begin_inset?.\n"
+                      << "Last inset read was: " << last_inset_read
+                      << endl;
+               // Simply ignore this. The insets do not have
+               // to read this.
+               // But insets should read it, it is a part of
+               // the inset isn't it? Lgb.
+       } else if (token == "\\begin_inset") {
+#ifndef NO_COMPABILITY
+               insertErtContents(par, pos, false);
+               ert_stack.push(ert_comp);
+               ert_comp = ErtComp();
+#endif
+               readInset(lex, par, pos, font);
+#ifndef NO_COMPABILITY
+               ert_comp = ert_stack.top();
+               ert_stack.pop();
+               insertErtContents(par, pos);
+#endif
+       } else if (token == "\\family") {
+               lex.next();
+               font.setLyXFamily(lex.getString());
+       } else if (token == "\\series") {
+               lex.next();
+               font.setLyXSeries(lex.getString());
+       } else if (token == "\\shape") {
+               lex.next();
+               font.setLyXShape(lex.getString());
+       } else if (token == "\\size") {
+               lex.next();
+               font.setLyXSize(lex.getString());
+#ifndef NO_COMPABILITY
+       } else if (token == "\\latex") {
+               lex.next();
+               string const tok = lex.getString();
+               if (tok == "no_latex") {
+                       // Do the insetert.
+                       insertErtContents(par, pos);
+               } else if (tok == "latex") {
+                       ert_comp.active = true;
+                       ert_comp.font = font;
+               } else if (tok == "default") {
+                       // Do the insetert.
+                       insertErtContents(par, pos);
+               } else {
+                       lex.printError("Unknown LaTeX font flag "
+                                      "`$$Token'");
+               }
+#endif
+       } else if (token == "\\lang") {
+               lex.next();
+               string const tok = lex.getString();
+               Language const * lang = languages.getLanguage(tok);
+               if (lang) {
+                       font.setLanguage(lang);
+               } else {
+                       font.setLanguage(params.language);
+                       lex.printError("Unknown language `$$Token'");
+               }
+       } else if (token == "\\numeric") {
+               lex.next();
+               font.setNumber(font.setLyXMisc(lex.getString()));
+       } else if (token == "\\emph") {
+               lex.next();
+               font.setEmph(font.setLyXMisc(lex.getString()));
+       } else if (token == "\\bar") {
+               lex.next();
+               string const tok = lex.getString();
+               // This is dirty, but gone with LyX3. (Asger)
+               if (tok == "under")
+                       font.setUnderbar(LyXFont::ON);
+               else if (tok == "no")
+                       font.setUnderbar(LyXFont::OFF);
+               else if (tok == "default")
+                       font.setUnderbar(LyXFont::INHERIT);
+               else
+                       lex.printError("Unknown bar font flag "
+                                      "`$$Token'");
+       } else if (token == "\\noun") {
+               lex.next();
+               font.setNoun(font.setLyXMisc(lex.getString()));
+       } else if (token == "\\color") {
+               lex.next();
+               font.setLyXColor(lex.getString());
+       } else if (token == "\\backslash") {
+#ifndef NO_COMPABILITY
+               if (ert_comp.active) {
+                       ert_comp.contents += "\\";
+               } else {
+#endif
+               par->insertChar(pos, '\\', font);
+               ++pos;
+#ifndef NO_COMPABILITY
+               }
+#endif
        } else if (token == "\\begin_float") {
                insertErtContents(par, pos);
                //insertErtContents(par, pos, false);
@@ -951,71 +1047,6 @@
        } else if (token == "\\float_placement") {
                lex.nextToken();
                params.float_placement = lex.getString();
-       } else if (token == "\\family") {
-               lex.next();
-               font.setLyXFamily(lex.getString());
-       } else if (token == "\\series") {
-               lex.next();
-               font.setLyXSeries(lex.getString());
-       } else if (token == "\\shape") {
-               lex.next();
-               font.setLyXShape(lex.getString());
-       } else if (token == "\\size") {
-               lex.next();
-               font.setLyXSize(lex.getString());
-#ifndef NO_COMPABILITY
-       } else if (token == "\\latex") {
-               lex.next();
-               string const tok = lex.getString();
-               if (tok == "no_latex") {
-                       // Do the insetert.
-                       insertErtContents(par, pos);
-               } else if (tok == "latex") {
-                       ert_comp.active = true;
-                       ert_comp.font = font;
-               } else if (tok == "default") {
-                       // Do the insetert.
-                       insertErtContents(par, pos);
-               } else {
-                       lex.printError("Unknown LaTeX font flag "
-                                      "`$$Token'");
-               }
-#endif
-       } else if (token == "\\lang") {
-               lex.next();
-               string const tok = lex.getString();
-               Language const * lang = languages.getLanguage(tok);
-               if (lang) {
-                       font.setLanguage(lang);
-               } else {
-                       font.setLanguage(params.language);
-                       lex.printError("Unknown language `$$Token'");
-               }
-       } else if (token == "\\numeric") {
-               lex.next();
-               font.setNumber(font.setLyXMisc(lex.getString()));
-       } else if (token == "\\emph") {
-               lex.next();
-               font.setEmph(font.setLyXMisc(lex.getString()));
-       } else if (token == "\\bar") {
-               lex.next();
-               string const tok = lex.getString();
-               // This is dirty, but gone with LyX3. (Asger)
-               if (tok == "under")
-                       font.setUnderbar(LyXFont::ON);
-               else if (tok == "no")
-                       font.setUnderbar(LyXFont::OFF);
-               else if (tok == "default")
-                       font.setUnderbar(LyXFont::INHERIT);
-               else
-                       lex.printError("Unknown bar font flag "
-                                      "`$$Token'");
-       } else if (token == "\\noun") {
-               lex.next();
-               font.setNoun(font.setLyXMisc(lex.getString()));
-       } else if (token == "\\color") {
-               lex.next();
-               font.setLyXColor(lex.getString());
        } else if (token == "\\align") {
                int tmpret = lex.findToken(string_align);
                if (tmpret == -1) ++tmpret;
@@ -1062,26 +1093,6 @@
                lex.eatLine();
                par->params().labelWidthString(lex.getString());
                // do not delete this token, it is still needed!
-       } else if (token == "\\end_inset") {
-               lyxerr << "Solitary \\end_inset. Missing \\begin_inset?.\n"
-                      << "Last inset read was: " << last_inset_read
-                      << endl;
-               // Simply ignore this. The insets do not have
-               // to read this.
-               // But insets should read it, it is a part of
-               // the inset isn't it? Lgb.
-       } else if (token == "\\begin_inset") {
-#ifndef NO_COMPABILITY
-               insertErtContents(par, pos, false);
-               ert_stack.push(ert_comp);
-               ert_comp = ErtComp();
-#endif
-               readInset(lex, par, pos, font);
-#ifndef NO_COMPABILITY
-               ert_comp = ert_stack.top();
-               ert_stack.pop();
-               insertErtContents(par, pos);
-#endif
        } else if (token == "\\SpecialChar") {
                LyXLayout const & layout =
                        textclasslist[params.textclass][par->layout()];
@@ -1157,17 +1168,6 @@
                        par->bibkey = new InsetBibKey(p);
                }
                par->bibkey->read(this, lex);
-       } else if (token == "\\backslash") {
-#ifndef NO_COMPABILITY
-               if (ert_comp.active) {
-                       ert_comp.contents += "\\";
-               } else {
-#endif
-               par->insertChar(pos, '\\', font);
-               ++pos;
-#ifndef NO_COMPABILITY
-               }
-#endif
        } else if (token == "\\the_end") {
 #ifndef NO_COMPABILITY
                // If we still have some ert active here we have to insert

[PATCH] Speed up parsing of documents

Reply via email to