Hello,
attached is a more precise parser for bibtex files. It is based on the
description found on this web page:
http://artis.imag.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html
Most important improvements:
- treats '\n' and '\r' as whitespace characters. Thus the @ does not
need to be at the beginning of a line.
- it reads @string entries and replaces the strings in the field values
- it also handles @comment entries
- it recovers from syntax errors at the next entry starting with @
- it ignores everything between entries
Please help: As I have no experience or theoretical background with
Unicode, I cannot tell if I handled it correctly in this patch.
Bernhard
PS: It should be a good basis for fixing bug 2757.
Maybe bug 109 can also be closed with that?
------------------------------------------------------------------------
Index: src/insets/insetbibtex.C
===================================================================
--- src/insets/insetbibtex.C (revision 17558)
+++ src/insets/insetbibtex.C (working copy)
@@ -57,6 +57,7 @@
using support::subst;
using support::tokenPos;
using support::trim;
+using support::lowercase;
namespace Alert = frontend::Alert;
namespace os = support::os;
@@ -67,6 +68,7 @@
using std::ostream;
using std::pair;
using std::vector;
+using std::map;
InsetBibtex::InsetBibtex(InsetCommandParams const & p)
@@ -327,7 +329,187 @@
return vec;
}
+namespace {
+ // methods for parsing bibtex files
+
+ // maps lowercased @string variable names to their replacement text
+ typedef map<docstring, docstring> VarMap;
+
+ /// returns true if ch is one of the whitespace characters
+ /// recognized by this parser (space, tab, CR, LF)
+ bool isWS(char_type ch) {
+ return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
+ }
+
+ /// returns true if ch is an ASCII decimal digit
+ bool isNumeric(char_type ch) {
+ return ch >= '0' && ch <= '9';
+ }
+
+	/// remove whitespace characters, optionally a single comma,
+	/// and further whitespace characters from the stream.
+	/// @return true if a comma was found, false otherwise
+	///
+	bool removeWSAndComma(idocfstream & ifs) {
+		char_type ch;
+
+		if (ifs.eof())
+			return false;
+
+		// skip whitespace
+		do {
+			ifs.get(ch);
+		} while (!ifs.eof() && isWS(ch));
+
+		if (ifs.eof())
+			return false;
+
+		if (ch != ',') {
+			// not a comma: leave the character for the caller
+			ifs.putback(ch);
+			return false;
+		}
+
+		// skip whitespace following the comma
+		do {
+			ifs.get(ch);
+		} while (!ifs.eof() && isWS(ch));
+
+		if (!ifs.eof()) {
+			ifs.putback(ch);
+		}
+
+		return true;
+	}
+
+	/// remove whitespace characters, read a character sequence
+	/// not containing whitespace characters or characters in
+	/// delimChars, and remove further whitespace characters.
+	/// The sequence is lowercased while it is read.
+	/// @return true if a string of length > 0 could be read.
+	///
+	bool readTypeOrKey(docstring & val, idocfstream & ifs,
+		docstring const & delimChars) {
+
+		char_type ch;
+
+		val.clear();
+
+		if (ifs.eof())
+			return false;
+
+		// skip whitespace
+		do {
+			ifs.get(ch);
+		} while (!ifs.eof() && isWS(ch));
+
+		if (ifs.eof())
+			return false;
+
+		// read the value, stopping at whitespace or a delimiter
+		while (!ifs.eof() && !isWS(ch)
+		       && delimChars.find(ch) == docstring::npos) {
+			val += lowercase(ch);
+			ifs.get(ch);
+		}
+
+		// skip trailing whitespace
+		while (!ifs.eof() && isWS(ch)) {
+			ifs.get(ch);
+		}
+
+		if (!ifs.eof()) {
+			ifs.putback(ch);
+		}
+
+		return val.length() > 0;
+	}
+
+	/// read subsequent bibtex values that are delimited with a #-character.
+	/// Concatenate all parts and replace names with the associated string in
+	/// the variable strings.
+	/// @return true if reading was successful (all single parts were
+	/// delimited correctly)
+	bool readValue(docstring & val, idocfstream & ifs, const VarMap & strings) {
+
+		char_type ch;
+
+		val.clear();
+
+		if (ifs.eof())
+			return false;
+
+		do {
+			// skip whitespace
+			do {
+				ifs.get(ch);
+			} while (!ifs.eof() && isWS(ch));
+
+			if (ifs.eof())
+				return false;
+
+			// check for the kind of the single value part
+			if (isNumeric(ch)) {
+				// read a bare number
+				do {
+					val += ch;
+					ifs.get(ch);
+				} while (!ifs.eof() && isNumeric(ch));
+
+				if (ifs.eof())
+					return false;
+
+			} else if (ch == '"' || ch == '{') {
+				// read a delimited value; braces may nest inside it
+				char_type delim = ch == '"' ? '"' : '}';
+				int nestLevel = 0;
+				ifs.get(ch);
+				while (!ifs.eof() && (nestLevel > 0 || ch != delim)) {
+					val += ch;
+					switch (ch) {
+					case '{':
+						++nestLevel;
+						break;
+					case '}':
+						--nestLevel;
+						// unbalanced closing brace: syntax error
+						if (nestLevel < 0)
+							return false;
+						break;
+					}
+					ifs.get(ch);
+				}
+
+				if (ifs.eof())
+					return false;
+
+				// skip the closing delimiter
+				ifs.get(ch);
+
+				if (ifs.eof())
+					return false;
+			} else {
+				// read a string variable name and substitute its value
+				docstring strName;
+				while (!ifs.eof() && !isWS(ch) && ch != '#'
+				       && ch != ',' && ch != '}' && ch != ')') {
+					strName += lowercase(ch);
+					ifs.get(ch);
+				}
+
+				if (ifs.eof())
+					return false;
+
+				// undefined variables are silently replaced by nothing
+				if (strName.length()) {
+					VarMap::const_iterator pos = strings.find(strName);
+					if (pos != strings.end()) {
+						val += pos->second;
+					}
+				}
+			}
+
+			// skip whitespace before a possible concatenation '#'
+			while (!ifs.eof() && isWS(ch)) {
+				ifs.get(ch);
+			}
+
+			if (ifs.eof())
+				return false;
+
+		} while (ch == '#');
+
+		ifs.putback(ch);
+
+		return true;
+	}
+}
+
+
// This method returns a comma separated list of Bibtex entries
void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
std::vector<std::pair<string, docstring> > & keys) const
@@ -335,10 +517,25 @@
vector<FileName> const files = getFiles(buffer);
for (vector<FileName>::const_iterator it = files.begin();
it != files.end(); ++ it) {
- // This is a _very_ simple parser for Bibtex database
- // files. All it does is to look for lines starting
- // in @ and not being @preamble and @string entries.
- // It does NOT do any syntax checking!
+		// This bibtex parser is a first step to parse bibtex files
+		// more precisely.
+		//
+		// - it reads the whole bibtex entry and does a syntax check
+		//   (matching delimiters, missing commas, ...)
+		// - it recovers from errors starting with the next @-character
+		// - it reads @string definitions and replaces them in the
+		//   field values.
+		// - it accepts more characters in keys or value names than
+		//   bibtex does.
+		//
+		// TODOS:
+		// - the entries are split into name = value pairs by the
+		//   parser. These have to be merged again because of the
+		//   way lyx treats the entries ( pair<...>(...) ). The citation
+		//   mechanism in lyx should be changed such that it can use
+		//   the split entries.
+		// - messages on parsing errors can be generated.
+		//
// Officially bibtex does only support ASCII, but in practice
// you can use the encoding of the main document as long as
@@ -350,28 +547,104 @@
idocfstream ifs(it->toFilesystemEncoding().c_str(),
std::ios_base::in,
buffer.params().encoding().iconvName());
- docstring linebuf0;
- while (getline(ifs, linebuf0)) {
- docstring linebuf = trim(linebuf0);
- if (linebuf.empty())
- continue;
- if (prefixIs(linebuf, '@')) {
- linebuf = subst(linebuf, '{', '(');
- docstring tmp;
- linebuf = split(linebuf, tmp, '(');
- tmp = ascii_lowercase(tmp);
- if (!prefixIs(tmp, from_ascii("@string")) &&
- !prefixIs(tmp, from_ascii("@preamble"))) {
- linebuf = split(linebuf, tmp, ',');
- tmp = ltrim(tmp, " \t");
- if (!tmp.empty()) {
- // FIXME UNICODE
- keys.push_back(pair<string, docstring>(
- to_utf8(tmp), docstring()));
+
+		char_type ch;
+		VarMap strings;
+
+		while (!ifs.eof()) {
+
+			ifs.get(ch);
+
+			if (ch == '@') {
+				// 'x' marks "no valid delimiter seen yet"
+				char_type entryDelim = 'x';
+
+				docstring entryType;
+
+				if (!readTypeOrKey(entryType, ifs, from_ascii("{("))
+				    || ifs.eof())
+					continue;
+
+				if (entryType == from_ascii("comment")) {
+					// NOTE(review): requires <limits> - confirm it is included
+					ifs.ignore(std::numeric_limits<int>::max(), '\n');
+				} else {
+					// look for entry delimiter
+					ifs.get(ch);
+					if (ifs.eof())
+						break;
+					if (ch == '(') entryDelim = ')';
+					else if (ch == '{') entryDelim = '}';
+
+					if (entryDelim == 'x') {
+						// invalid entry delimiter
+						ifs.putback(ch);
+					} else {
+						if (entryType == from_ascii("string")) {
+							// read string and add it to the strings map
+							// (or replace its old value)
+							docstring name;
+							docstring value;
+							if (!readTypeOrKey(name, ifs, from_ascii("#=}),"))
+							    || ifs.eof())
+								continue;
+							ifs.get(ch);
+							if (ifs.eof() || ch != '=')
+								continue;
+							if (!readValue(value, ifs, strings))
+								continue;
+							strings[name] = value;
+						} else if (entryType == from_ascii("preamble")) {
+							// preamble definitions are discarded.
+							// can they be of any use in lyx?
+							docstring value;
+							if (!readValue(value, ifs, strings))
+								continue;
+						} else {
+							docstring key;
+							docstring fields;
+							docstring name;
+							docstring value;
+							docstring newline;
+
+							if (!readTypeOrKey(key, ifs, from_ascii(",})"))
+							    || ifs.eof())
+								continue;
+
+							// now we have a key, so we will add an entry
+							// (even if it's empty)
+							bool readNext = removeWSAndComma(ifs);
+
+							while (!ifs.eof() && readNext) {
+								// read field name
+								if (!readTypeOrKey(name, ifs, from_ascii("=}),"))
+								    || ifs.eof())
+									break;
+
+								// next char must be an equal sign
+								ifs.get(ch);
+								if (ifs.eof())
+									break;
+								if (ch != '=') {
+									ifs.putback(ch);
+									break;
+								}
+
+								// read field value
+								if (!readValue(value, ifs, strings))
+									break;
+
+								fields += newline;
+								fields += name + from_ascii(" = {") + value + '}';
+								if (!newline.length())
+									newline = from_ascii(",\n");
+
+								readNext = removeWSAndComma(ifs);
+							}
+
+							keys.push_back(pair<string, docstring>(
+								to_utf8(key), fields));
+						}
 					}
 				}
-		} else if (!keys.empty())
-			keys.back().second += linebuf + '\n';
+			}
 		}
 	}
 }