Bernhard Roider wrote:
Is anybody interested in this one? I tested it with some corrupt bib files and got the same citation entries as with bibtex, whereas the current implementation displayed merely useless trash. The difference to bibtex is that this parser accepts more characters in keys, names, and entry types (which can lead to errors if lyx has an entry that bibtex skips, but it is truly much closer to real bibtex).

I personally like the principle. I even think we should also support the _writing_ of new BibTeX entries. When we do that, we could then ditch the basic internal bibitem support. But I haven't reviewed the code... too many nested if and loops at first glance.

Abdel.



Bernhard


Bernhard Roider wrote:
Hello,

attached is a more precise parser for bibtex files. It is based on the description found on this web page: http://artis.imag.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html

Most important improvements:
- treats '\n' and '\r' as whitespace characters. Thus the @ does not need to be at the beginning of a line.
- it reads @string entries and replaces the strings in the field values
- it also handles @comment entries
- it recovers from syntax errors at the next entry starting with @
- it ignores everything between entries

Please help: As I have no experience or theoretical background with Unicode, I cannot tell if I handled it correctly in this patch.

Bernhard

PS: It should be a good basis for fixing bug 2757.
Maybe bug 109 can also be closed with that?


------------------------------------------------------------------------

Index: src/insets/insetbibtex.C
===================================================================
--- src/insets/insetbibtex.C    (revision 17558)
+++ src/insets/insetbibtex.C    (working copy)
@@ -57,6 +57,7 @@
 using support::subst;
 using support::tokenPos;
 using support::trim;
+using support::lowercase;
namespace Alert = frontend::Alert;
 namespace os = support::os;
@@ -67,6 +68,7 @@
 using std::ostream;
 using std::pair;
 using std::vector;
+using std::map;
InsetBibtex::InsetBibtex(InsetCommandParams const & p)
@@ -327,7 +329,187 @@
     return vec;
 }
+namespace { + // methods for parsing bibtex files
+
+    typedef map<docstring, docstring> VarMap;
+
+    bool isWS(char_type ch) {
+        return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
+    }
+
+    bool isNumeric(char_type ch) {
+        return ch >= '0' && ch <= '9';
+    }
+
+ /// remove whitespace characters, optionally a single comma, + /// and further whitespace characters from the stream.
+    /// @return true if a comma was found, false otherwise
+    ///
+    bool removeWSAndComma(idocfstream & ifs) {
+        char_type ch;
+
+        if (ifs.eof()) +            return false;
+
+        // skip whitespace
+        do {
+            ifs.get(ch);
+        } while (!ifs.eof() && isWS(ch));
+
+        if (ifs.eof()) +            return false;
+
+        if (ch != ',') {
+            ifs.putback(ch);
+            return false;
+        }
+
+        // skip whitespace
+        do {
+            ifs.get(ch);
+        } while (!ifs.eof() && isWS(ch));
+
+        if (!ifs.eof()) {
+            ifs.putback(ch);
+        }
+
+        return true;
+    }
+
+    /// remove whitespace characters, read characer sequence
+    /// not containing whitespace characters or characters in
+    /// delimChars, and remove further whitespace characters.
+    /// @return true if a string of length > 0 could be read.
+    ///
+ bool readTypeOrKey(docstring & val, idocfstream & ifs, docstring const & delimChars) {
+
+        char_type ch;
+
+        val.clear();
+
+        if (ifs.eof()) +            return false;
+
+        // skip whitespace
+        do {
+            ifs.get(ch);
+        } while (!ifs.eof() && isWS(ch));
+
+        if (ifs.eof()) +            return false;
+
+ // read value + while (!ifs.eof() && !isWS(ch) && delimChars.find(ch) == docstring::npos) {
+            val += lowercase(ch);
+            ifs.get(ch);
+        }
+
+        // skip whitespace
+        while (!ifs.eof() && isWS(ch)) {
+            ifs.get(ch);
+        }
+
+        if (!ifs.eof()) {
+            ifs.putback(ch);
+        }
+
+        return val.length() > 0;
+    }
+
+ /// read subsequent bibtex values that are delimited with a #-character. + /// Concatenate all parts and replace names with the associated string in + /// the variable strings. + /// @return true if reading was successfull (all single parts were delimited
+    /// correctly)
+ bool readValue(docstring & val, idocfstream & ifs, const VarMap & strings) { + + char_type ch;
+
+        val.clear();
+
+        if (ifs.eof()) +            return false;
+
+        do {
+            // skip whitespace
+            do {
+                ifs.get(ch);
+            } while (!ifs.eof() && isWS(ch));
+
+            if (ifs.eof())
+                return false;
+
+            // check for field delimiter / type
+            if (isNumeric(ch)) {
+                do {
+                    val += ch;
+                    ifs.get(ch);
+                } while (!ifs.eof() && isNumeric(ch));
+
+                if (ifs.eof())
+                    return false;
+
+            } else if (ch == '"' || ch == '{') {
+                char_type delim = ch == '"'? '"': '}';
+                int nestLevel = 0;
+                ifs.get(ch);
+                while (!ifs.eof() && (nestLevel > 0 || ch != delim)) {
+                    val += ch;
+                    switch (ch) {
+                        case '{':
+                            ++nestLevel;
+                            break;
+                        case '}':
+                            --nestLevel;
+                            if (nestLevel < 0) return false;
+                            break;
+                    }
+                    ifs.get(ch);
+                }
+
+                if (ifs.eof())
+                    return false;
+
+                ifs.get(ch);
+
+                if (ifs.eof())
+                    return false;
+            } else {
+                docstring strName;
+ while (!ifs.eof() && !isWS(ch) && ch != '#' && ch != ',' && ch != '}' && ch != ')') {
+                    strName += lowercase(ch);
+                    ifs.get(ch);
+                }
+
+                if (ifs.eof())
+                    return false;
+
+                if (strName.length()) {
+                    VarMap::const_iterator pos = strings.find(strName);
+                    if (pos != strings.end()) {
+                        val += pos->second;
+                    }
+                }
+            }
+
+            while (!ifs.eof() && isWS(ch)) {
+                ifs.get(ch);
+            }
+
+            if (ifs.eof())
+                return false;
+
+        } while (ch == '#');
+
+        ifs.putback(ch);
+
+        return true;
+    }
+}
+
+
 // This method returns a comma separated list of Bibtex entries
 void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
         std::vector<std::pair<string, docstring> > & keys) const
@@ -335,10 +517,25 @@
     vector<FileName> const files = getFiles(buffer);
     for (vector<FileName>::const_iterator it = files.begin();
          it != files.end(); ++ it) {
-        // This is a _very_ simple parser for Bibtex database
-        // files. All it does is to look for lines starting
-        // in @ and not being @preamble and @string entries.
-        // It does NOT do any syntax checking!
+        // This bibtex parser is a first step to parse bibtex files
+ // more precisely. + // + // - it reads the whole bibtex entry and does a syntax check
+        //   (matching delimiters, missing commas,...
+        // - it recovers from errors starting with the next @-character
+ // - it reads @string definitions and replaces them in the + // field values. + // - it accepts more characters in keys or value names than + // bibtex does.
+        //
+        // TODOS:
+ // - the entries are split into name = value pairs by the + // parser. These have to be merged again because of the + // way lyx treats the entries ( pair<...>(...) ). The citation
+        //   mechanism in lyx should be changed such that it can use
+        //   the split entries.
+        // - messages on parsing errors can be generated.
+        //
// Officially bibtex does only support ASCII, but in practice
         // you can use the encoding of the main document as long as
@@ -350,28 +547,104 @@
         idocfstream ifs(it->toFilesystemEncoding().c_str(),
                         std::ios_base::in,
                         buffer.params().encoding().iconvName());
-        docstring linebuf0;
-        while (getline(ifs, linebuf0)) {
-            docstring linebuf = trim(linebuf0);
-            if (linebuf.empty())
-                continue;
-            if (prefixIs(linebuf, '@')) {
-                linebuf = subst(linebuf, '{', '(');
-                docstring tmp;
-                linebuf = split(linebuf, tmp, '(');
-                tmp = ascii_lowercase(tmp);
-                if (!prefixIs(tmp, from_ascii("@string")) &&
-                    !prefixIs(tmp, from_ascii("@preamble"))) {
-                    linebuf = split(linebuf, tmp, ',');
-                    tmp = ltrim(tmp, " \t");
-                    if (!tmp.empty()) {
-                        // FIXME UNICODE
-                        keys.push_back(pair<string, docstring>(
-                            to_utf8(tmp), docstring()));
+ + char_type ch;
+        VarMap strings;
+
+        while (!ifs.eof()) {
+
+            ifs.get(ch);
+
+            if (ch == '@') {
+                char_type entryDelim = 'x';
+
+                docstring entryType;
+
+ if (!readTypeOrKey(entryType, ifs, from_ascii("{(")) + || ifs.eof())
+                    continue;
+
+                if (entryType == from_ascii("comment")) {
+                    ifs.ignore(std::numeric_limits<int>::max(), '\n');
+                } else {
+                    // look for entry delimiter
+                    ifs.get(ch);
+                    if (ifs.eof()) +                        break;
+                    if (ch == '(') entryDelim = ')';
+                    else if (ch == '{') entryDelim = '}';
+
+                    if (entryDelim == 'x') {
+                        // invalid entry delimiter
+                        ifs.putback(ch);
+                    } else {
+                        if (entryType == from_ascii("string")) {
+ // read string and add it to the strings map + // (or replace it's old value)
+                            docstring name;
+                            docstring value;
+ if (!readTypeOrKey(name, ifs, from_ascii("#=}),")) + || ifs.eof())
+                                continue;
+                            ifs.get(ch);
+                            if (ifs.eof() || ch != '=')
+                                continue;
+                            if (!readValue(value, ifs, strings))
+                                continue;
+                            strings[name] = value;
+ } else if (entryType == from_ascii("preamble")) { + // preamble definitions are discarded. + // can they be of any use in lyx?
+                            docstring value;
+                            if (!readValue(value, ifs, strings))
+                                continue;
+                        } else {
+                            docstring key;
+                            docstring fields;
+                            docstring name;
+                            docstring value;
+                            docstring newline;
+
+ if (!readTypeOrKey(key, ifs, from_ascii(",})")) + || ifs.eof())
+                                continue;
+ + // now we have a key, so we will add an entry + // (even if it's empty)
+                            bool readNext = removeWSAndComma(ifs);
+
+                            while (!ifs.eof() && readNext) {
+                                // read field name
+ if (!readTypeOrKey(name, ifs, from_ascii("=}),")) + || ifs.eof())
+                                    break;
+
+                                // next char must be an equal sign
+                                ifs.get(ch);
+                                if (ifs.eof())
+                                    break;
+                                if (ch != '=') {
+                                    ifs.putback(ch);
+                                    break;
+                                }
+
+                                // read field value
+ if (!readValue(value, ifs, strings)) + break;
+
+                                fields += newline;
+ fields += name + from_ascii(" = {") + value + '}'; + if (!newline.length()) newline = from_ascii(",\n"); +
+                                readNext = removeWSAndComma(ifs);
+                            }
+
+                            keys.push_back(pair<string, docstring>(
+                            to_utf8(key), fields));
+                        }
                     }
                 }
-            } else if (!keys.empty())
-                keys.back().second += linebuf + '\n';
+            }
         }
     }
 }



Reply via email to