[patch] new bibtex parser (bug 1826)

Bernhard Roider Sun, 25 Mar 2007 14:43:03 -0800

Hello,

attached is a more precise parser for bibtex files. It is based on the description found on this webpage: http://artis.imag.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html


Most important improvements:

- treats '\n' and '\r' as whitespace characters. Thus the @ does not need to be at the beginning of a line.

- it reads @string entries and replaces the strings in the field values
- it also handles @comment entries
- it recovers from syntax errors at the next entry starting with @
- it ignores everything between entries

Please help: As I have no experience or theoretical background with Unicode, I cannot tell if I handled it correctly in this patch.


Bernhard

PS: It should be a good basis for fixing bug 2757.
Maybe bug 109 can also be closed with that?

Index: src/insets/insetbibtex.C
===================================================================
--- src/insets/insetbibtex.C    (revision 17558)
+++ src/insets/insetbibtex.C    (working copy)
@@ -57,6 +57,7 @@
 using support::subst;
 using support::tokenPos;
 using support::trim;
+using support::lowercase;
 
 namespace Alert = frontend::Alert;
 namespace os = support::os;
@@ -67,6 +68,7 @@
 using std::ostream;
 using std::pair;
 using std::vector;
+using std::map;
 
 
 InsetBibtex::InsetBibtex(InsetCommandParams const & p)
@@ -327,7 +329,187 @@
        return vec;
 }
 
+namespace {
 
+       // methods for parsing bibtex files
+
+       typedef map<docstring, docstring> VarMap;
+
+       /// @return true if ch is a blank, tab, carriage return or line feed.
+       bool isWS(char_type ch) {
+               switch (ch) {
+               case ' ':
+               case '\t':
+               case '\r':
+               case '\n':
+                       return true;
+               default:
+                       return false;
+               }
+       }
+
+       /// @return true if ch is one of the digits '0' through '9'.
+       bool isNumeric(char_type ch) {
+               return !(ch < '0' || ch > '9');
+       }
+
+       /// Skip whitespace characters, optionally a single comma,
+       /// and further whitespace characters from the stream. The first
+       /// character that is not consumed is put back into the stream.
+       /// @param ifs stream to read from
+       /// @return true if a comma was found, false otherwise (also when
+       /// the stream is exhausted or fails before a comma is seen)
+       ///
+       bool removeWSAndComma(idocfstream & ifs) {
+               char_type ch;
+
+               // Test the stream state instead of eof() alone: a failed read
+               // that does not set eofbit would leave ch unset and the loops
+               // below would never terminate.
+               if (!ifs)
+                       return false;
+
+               // skip whitespace
+               do {
+                       ifs.get(ch);
+               } while (ifs && isWS(ch));
+
+               if (!ifs)
+                       return false;
+
+               if (ch != ',') {
+                       ifs.putback(ch);
+                       return false;
+               }
+
+               // skip whitespace
+               do {
+                       ifs.get(ch);
+               } while (ifs && isWS(ch));
+
+               // only a character that was really read may be put back
+               if (ifs) {
+                       ifs.putback(ch);
+               }
+
+               return true;
+       }
+
+       /// Skip whitespace characters, read a character sequence
+       /// not containing whitespace characters or characters in
+       /// delimChars, and skip further whitespace characters. The
+       /// character that ends the sequence is put back into the stream.
+       /// All characters read are converted to lowercase.
+       /// NOTE(review): this function is also used to read citation keys,
+       /// so keys are lowercased too -- confirm that no caller needs the
+       /// original case.
+       /// @param val receives the sequence read (cleared first)
+       /// @param ifs stream to read from
+       /// @param delimChars characters that terminate the sequence
+       /// @return true if a string of length > 0 could be read.
+       ///
+       bool readTypeOrKey(docstring & val, idocfstream & ifs, docstring const & delimChars) {
+
+               char_type ch;
+
+               val.clear();
+
+               // Test the stream state instead of eof() alone: a failed read
+               // that does not set eofbit would leave ch unset and the loops
+               // below would never terminate.
+               if (!ifs)
+                       return false;
+
+               // skip whitespace
+               do {
+                       ifs.get(ch);
+               } while (ifs && isWS(ch));
+
+               if (!ifs)
+                       return false;
+
+               // read value
+               while (ifs && !isWS(ch) && delimChars.find(ch) == docstring::npos) {
+                       val += lowercase(ch);
+                       ifs.get(ch);
+               }
+
+               // skip whitespace
+               while (ifs && isWS(ch)) {
+                       ifs.get(ch);
+               }
+
+               // only a character that was really read may be put back
+               if (ifs) {
+                       ifs.putback(ch);
+               }
+
+               return val.length() > 0;
+       }
+
+       /// Read subsequent bibtex values that are delimited with a #-character.
+       /// Concatenate all parts and replace names with the associated string
+       /// in the variable strings. Names are looked up in lowercase; a name
+       /// without an entry in strings contributes nothing to the result.
+       /// @param val receives the concatenated value (cleared first)
+       /// @param ifs stream to read from; the character following the value
+       /// is put back into the stream
+       /// @param strings the known @string definitions
+       /// @return true if reading was successful (all single parts were
+       /// delimited correctly and the value was not cut off by the end of
+       /// the stream)
+       bool readValue(docstring & val, idocfstream & ifs, const VarMap & strings) {
+
+               char_type ch;
+
+               val.clear();
+
+               // Test the stream state instead of eof() alone: a failed read
+               // that does not set eofbit would leave ch unset and the loops
+               // below would never terminate.
+               if (!ifs)
+                       return false;
+
+               do {
+                       // skip whitespace
+                       do {
+                               ifs.get(ch);
+                       } while (ifs && isWS(ch));
+
+                       if (!ifs)
+                               return false;
+
+                       // check for field delimiter / type
+                       if (isNumeric(ch)) {
+                               // undelimited number: copy the digits verbatim
+                               do {
+                                       val += ch;
+                                       ifs.get(ch);
+                               } while (ifs && isNumeric(ch));
+
+                               if (!ifs)
+                                       return false;
+
+                       } else if (ch == '"' || ch == '{') {
+                               // delimited part: copy everything up to the closing
+                               // delimiter at nesting level 0; inner braces must
+                               // be balanced
+                               char_type delim = ch == '"'? '"': '}';
+                               int nestLevel = 0;
+                               ifs.get(ch);
+                               while (ifs && (nestLevel > 0 || ch != delim)) {
+                                       val += ch;
+                                       switch (ch) {
+                                               case '{':
+                                                       ++nestLevel;
+                                                       break;
+                                               case '}':
+                                                       --nestLevel;
+                                                       if (nestLevel < 0) return false;
+                                                       break;
+                                       }
+                                       ifs.get(ch);
+                               }
+
+                               if (!ifs)
+                                       return false;
+
+                               // read the character following the closing delimiter
+                               ifs.get(ch);
+
+                               if (!ifs)
+                                       return false;
+                       } else {
+                               // bare name: refers to a string definition
+                               docstring strName;
+                               while (ifs && !isWS(ch) && ch != '#' && ch != ',' && ch != '}' && ch != ')') {
+                                       strName += lowercase(ch);
+                                       ifs.get(ch);
+                               }
+
+                               if (!ifs)
+                                       return false;
+
+                               if (strName.length()) {
+                                       VarMap::const_iterator pos = strings.find(strName);
+                                       if (pos != strings.end()) {
+                                               val += pos->second;
+                                       }
+                               }
+                       }
+
+                       while (ifs && isWS(ch)) {
+                               ifs.get(ch);
+                       }
+
+                       if (!ifs)
+                               return false;
+
+               } while (ch == '#');
+
+               ifs.putback(ch);
+
+               return true;
+       }
+}
+
+
 // This method returns a comma separated list of Bibtex entries
 void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
                std::vector<std::pair<string, docstring> > & keys) const
@@ -335,10 +517,25 @@
        vector<FileName> const files = getFiles(buffer);
        for (vector<FileName>::const_iterator it = files.begin();
             it != files.end(); ++ it) {
-               // This is a _very_ simple parser for Bibtex database
-               // files. All it does is to look for lines starting
-               // in @ and not being @preamble and @string entries.
-               // It does NOT do any syntax checking!
+           // This bibtex parser is a first step to parse bibtex files
+               // more precisely. 
+               // 
+               // - it reads the whole bibtex entry and does a syntax check
+               //   (matching delimiters, missing commas,...
+               // - it recovers from errors starting with the next @-character
+               // - it reads @string definitions and replaces them in the 
+               //   field values.
+               // - it accepts more characters in keys or value names than 
+               //   bibtex does.
+               //
+               // TODOS:
+               // - the entries are split into name = value pairs by the 
+               //   parser. These have to be merged again because of the 
+               //   way lyx treats the entries ( pair<...>(...) ). The citation
+               //   mechanism in lyx should be changed such that it can use
+               //   the split entries.
+               // - messages on parsing errors can be generated.
+               //
 
                // Officially bibtex does only support ASCII, but in practice
                // you can use the encoding of the main document as long as
@@ -350,28 +547,104 @@
                idocfstream ifs(it->toFilesystemEncoding().c_str(),
                                std::ios_base::in,
                                buffer.params().encoding().iconvName());
-               docstring linebuf0;
-               while (getline(ifs, linebuf0)) {
-                       docstring linebuf = trim(linebuf0);
-                       if (linebuf.empty())
-                               continue;
-                       if (prefixIs(linebuf, '@')) {
-                               linebuf = subst(linebuf, '{', '(');
-                               docstring tmp;
-                               linebuf = split(linebuf, tmp, '(');
-                               tmp = ascii_lowercase(tmp);
-                               if (!prefixIs(tmp, from_ascii("@string")) &&
-                                   !prefixIs(tmp, from_ascii("@preamble"))) {
-                                       linebuf = split(linebuf, tmp, ',');
-                                       tmp = ltrim(tmp, " \t");
-                                       if (!tmp.empty()) {
-                                               // FIXME UNICODE
-                                               keys.push_back(pair<string, 
docstring>(
-                                                       to_utf8(tmp), 
docstring()));
+               
+               char_type ch;
+               VarMap strings;
+
+               // Loop on the stream state instead of eof() alone, so that a
+               // read error which does not set eofbit cannot cause an endless
+               // loop.
+               while (ifs) {
+
+                       ifs.get(ch);
+
+                       // ch is only valid if the read succeeded
+                       if (!ifs)
+                               break;
+
+                       // everything that is not part of an @-entry is skipped
+                       if (ch == '@') {
+                               char_type entryDelim = 'x';
+
+                               docstring entryType;
+
+                               if (!readTypeOrKey(entryType, ifs, from_ascii("{("))
+                                       || !ifs)
+                                       continue;
+
+                               if (entryType == from_ascii("comment")) {
+                                       // the rest of the line is dropped
+                                       ifs.ignore(std::numeric_limits<int>::max(), '\n');
+                               } else {
+                                       // look for entry delimiter
+                                       ifs.get(ch);
+                                       if (!ifs)
+                                               break;
+                                       // Remember the closing delimiter matching the
+                                       // opening one; 'x' marks an invalid opening
+                                       // delimiter. Only the 'x' marker is tested
+                                       // below, the closing delimiter is not checked.
+                                       if (ch == '(') entryDelim = ')';
+                                       else if (ch == '{') entryDelim = '}';
+
+                                       if (entryDelim == 'x') {
+                                               // invalid entry delimiter
+                                               ifs.putback(ch);
+                                       } else {
+                                               if (entryType == from_ascii("string")) {
+                                                       // read string and add it to the strings map
+                                                       // (or replace its old value)
+                                                       docstring name;
+                                                       docstring value;
+                                                       if (!readTypeOrKey(name, ifs, from_ascii("#=}),"))
+                                                               || !ifs)
+                                                               continue;
+                                                       ifs.get(ch);
+                                                       if (!ifs || ch != '=')
+                                                               continue;
+                                                       if (!readValue(value, ifs, strings))
+                                                               continue;
+                                                       strings[name] = value;
+                                               } else if (entryType == from_ascii("preamble")) {
+                                                       // preamble definitions are discarded.
+                                                       // can they be of any use in lyx?
+                                                       docstring value;
+                                                       if (!readValue(value, ifs, strings))
+                                                               continue;
+                                               } else {
+                                                       docstring key;
+                                                       docstring fields;
+                                                       docstring name;
+                                                       docstring value;
+                                                       docstring newline;
+
+                                                       if (!readTypeOrKey(key, ifs, from_ascii(",})"))
+                                                               || !ifs)
+                                                               continue;
+
+                                                       // now we have a key, so we will add an entry
+                                                       // (even if it's empty)
+                                                       bool readNext = removeWSAndComma(ifs);
+
+                                                       while (ifs && readNext) {
+                                                               // read field name
+                                                               if (!readTypeOrKey(name, ifs, from_ascii("=}),"))
+                                                                       || !ifs)
+                                                                       break;
+
+                                                               // next char must be an equal sign
+                                                               ifs.get(ch);
+                                                               if (!ifs)
+                                                                       break;
+                                                               if (ch != '=') {
+                                                                       ifs.putback(ch);
+                                                                       break;
+                                                               }
+
+                                                               // read field value
+                                                               if (!readValue(value, ifs, strings))
+                                                                       break;
+
+                                                               // fields are merged again as "name = {value}",
+                                                               // separated by ",\n"
+                                                               fields += newline;
+                                                               fields += name + from_ascii(" = {") + value + '}';
+                                                               if (!newline.length()) newline = from_ascii(",\n");
+
+                                                               readNext = removeWSAndComma(ifs);
+                                                       }
+
+                                                       keys.push_back(pair<string, docstring>(
+                                                       to_utf8(key), fields));
+                                               }
                                        }
                                }
-                       } else if (!keys.empty())
-                               keys.back().second += linebuf + '\n';
+                       }
                }
        }
 }

[patch] new bibtex parser (bug 1826)

Reply via email to