Hello,
attached is a more precise parser for bibtex files. It is based on the
description found on this web page:
http://artis.imag.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html
Most important improvements:
- treats '\n' and '\r' as whitespace characters. Thus the @ does not
need to be at the beginning of a line.
- it reads @string entries and replaces the strings in the field values
- it also handles @comment entries
- it recovers from syntax errors at the next entry starting with @
- it ignores everything between entries
Please help: As I have no experience or theoretical background with
Unicode, I cannot tell if I handled it correctly in this patch.
Bernhard
PS: It should be a good basis for fixing bug 2757.
Maybe bug 109 can also be closed with that?
------------------------------------------------------------------------
Index: src/insets/insetbibtex.C
===================================================================
--- src/insets/insetbibtex.C (revision 17558)
+++ src/insets/insetbibtex.C (working copy)
@@ -57,6 +57,7 @@
using support::subst;
using support::tokenPos;
using support::trim;
+using support::lowercase;
namespace Alert = frontend::Alert;
namespace os = support::os;
@@ -67,6 +68,7 @@
using std::ostream;
using std::pair;
using std::vector;
+using std::map;
InsetBibtex::InsetBibtex(InsetCommandParams const & p)
@@ -327,7 +329,187 @@
return vec;
}
+namespace {
+ // methods for parsing bibtex files
+
+ // maps lowercased @string variable names to their replacement text
+ typedef map<docstring, docstring> VarMap;
+
+ /// returns true if ch is one of the whitespace characters
+ /// recognized by this parser (space, tab, CR, LF)
+ bool isWS(char_type ch) {
+ return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
+ }
+
+ /// returns true if ch is an ASCII decimal digit
+ bool isNumeric(char_type ch) {
+ return ch >= '0' && ch <= '9';
+ }
+
+	/// remove whitespace characters, optionally a single comma,
+	/// and further whitespace characters from the stream.
+	/// @return true if a comma was found, false otherwise
+	///
+	bool removeWSAndComma(idocfstream & ifs) {
+		char_type ch;
+
+		if (ifs.eof())
+			return false;
+
+		// skip whitespace
+		do {
+			ifs.get(ch);
+		} while (!ifs.eof() && isWS(ch));
+
+		if (ifs.eof())
+			return false;
+
+		if (ch != ',') {
+			// not a comma: leave the character for the caller
+			ifs.putback(ch);
+			return false;
+		}
+
+		// skip whitespace following the comma
+		do {
+			ifs.get(ch);
+		} while (!ifs.eof() && isWS(ch));
+
+		if (!ifs.eof()) {
+			ifs.putback(ch);
+		}
+
+		return true;
+	}
+
+	/// remove whitespace characters, read a character sequence
+	/// not containing whitespace characters or characters in
+	/// delimChars, and remove further whitespace characters.
+	/// The sequence is lowercased while it is read.
+	/// @return true if a string of length > 0 could be read.
+	///
+	bool readTypeOrKey(docstring & val, idocfstream & ifs,
+		docstring const & delimChars) {
+
+		char_type ch;
+
+		val.clear();
+
+		if (ifs.eof())
+			return false;
+
+		// skip whitespace
+		do {
+			ifs.get(ch);
+		} while (!ifs.eof() && isWS(ch));
+
+		if (ifs.eof())
+			return false;
+
+		// read the value, stopping at whitespace or a delimiter
+		while (!ifs.eof() && !isWS(ch)
+		       && delimChars.find(ch) == docstring::npos) {
+			val += lowercase(ch);
+			ifs.get(ch);
+		}
+
+		// skip trailing whitespace
+		while (!ifs.eof() && isWS(ch)) {
+			ifs.get(ch);
+		}
+
+		if (!ifs.eof()) {
+			ifs.putback(ch);
+		}
+
+		return val.length() > 0;
+	}
+
+	/// read subsequent bibtex values that are delimited with a #-character.
+	/// Concatenate all parts and replace names with the associated string in
+	/// the variable strings.
+	/// @return true if reading was successful (all single parts were
+	/// delimited correctly)
+	bool readValue(docstring & val, idocfstream & ifs, const VarMap & strings) {
+
+		char_type ch;
+
+		val.clear();
+
+		if (ifs.eof())
+			return false;
+
+		do {
+			// skip whitespace
+			do {
+				ifs.get(ch);
+			} while (!ifs.eof() && isWS(ch));
+
+			if (ifs.eof())
+				return false;
+
+			// check for the kind of the single value part
+			if (isNumeric(ch)) {
+				// read a bare number
+				do {
+					val += ch;
+					ifs.get(ch);
+				} while (!ifs.eof() && isNumeric(ch));
+
+				if (ifs.eof())
+					return false;
+
+			} else if (ch == '"' || ch == '{') {
+				// read a delimited value; braces may nest inside it
+				char_type delim = ch == '"' ? '"' : '}';
+				int nestLevel = 0;
+				ifs.get(ch);
+				while (!ifs.eof() && (nestLevel > 0 || ch != delim)) {
+					val += ch;
+					switch (ch) {
+					case '{':
+						++nestLevel;
+						break;
+					case '}':
+						--nestLevel;
+						// unbalanced closing brace: syntax error
+						if (nestLevel < 0)
+							return false;
+						break;
+					}
+					ifs.get(ch);
+				}
+
+				if (ifs.eof())
+					return false;
+
+				// skip the closing delimiter
+				ifs.get(ch);
+
+				if (ifs.eof())
+					return false;
+			} else {
+				// read a string variable name and substitute its value
+				docstring strName;
+				while (!ifs.eof() && !isWS(ch) && ch != '#'
+				       && ch != ',' && ch != '}' && ch != ')') {
+					strName += lowercase(ch);
+					ifs.get(ch);
+				}
+
+				if (ifs.eof())
+					return false;
+
+				// undefined variables are silently replaced by nothing
+				if (strName.length()) {
+					VarMap::const_iterator pos = strings.find(strName);
+					if (pos != strings.end()) {
+						val += pos->second;
+					}
+				}
+			}
+
+			// skip whitespace before a possible concatenation '#'
+			while (!ifs.eof() && isWS(ch)) {
+				ifs.get(ch);
+			}
+
+			if (ifs.eof())
+				return false;
+
+		} while (ch == '#');
+
+		ifs.putback(ch);
+
+		return true;
+	}
+}
+
+
// This method returns a comma separated list of Bibtex entries
void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
std::vector<std::pair<string, docstring> > & keys) const
@@ -335,10 +517,25 @@
vector<FileName> const files = getFiles(buffer);
for (vector<FileName>::const_iterator it = files.begin();
it != files.end(); ++ it) {
- // This is a _very_ simple parser for Bibtex database
- // files. All it does is to look for lines starting
- // in @ and not being @preamble and @string entries.
- // It does NOT do any syntax checking!
+		// This bibtex parser is a first step to parse bibtex files
+		// more precisely.
+		//
+		// - it reads the whole bibtex entry and does a syntax check
+		//   (matching delimiters, missing commas, ...)
+		// - it recovers from errors starting with the next @-character
+		// - it reads @string definitions and replaces them in the
+		//   field values.
+		// - it accepts more characters in keys or value names than
+		//   bibtex does.
+		//
+		// TODOS:
+		// - the entries are split into name = value pairs by the
+		//   parser. These have to be merged again because of the
+		//   way lyx treats the entries ( pair<...>(...) ). The citation
+		//   mechanism in lyx should be changed such that it can use
+		//   the split entries.
+		// - messages on parsing errors can be generated.
+		//
// Officially bibtex does only support ASCII, but in practice
// you can use the encoding of the main document as long as
@@ -350,28 +547,104 @@
idocfstream ifs(it->toFilesystemEncoding().c_str(),
std::ios_base::in,
buffer.params().encoding().iconvName());
- docstring linebuf0;
- while (getline(ifs, linebuf0)) {
- docstring linebuf = trim(linebuf0);
- if (linebuf.empty())
- continue;
- if (prefixIs(linebuf, '@')) {
- linebuf = subst(linebuf, '{', '(');
- docstring tmp;
- linebuf = split(linebuf, tmp, '(');
- tmp = ascii_lowercase(tmp);
- if (!prefixIs(tmp, from_ascii("@string")) &&
- !prefixIs(tmp, from_ascii("@preamble"))) {
- linebuf = split(linebuf, tmp, ',');
- tmp = ltrim(tmp, " \t");
- if (!tmp.empty()) {
- // FIXME UNICODE
- keys.push_back(pair<string, docstring>(
- to_utf8(tmp), docstring()));
+
+		char_type ch;
+		VarMap strings;
+
+		while (!ifs.eof()) {
+
+			ifs.get(ch);
+
+			if (ch == '@') {
+				// 'x' marks "no valid delimiter seen yet"
+				char_type entryDelim = 'x';
+
+				docstring entryType;
+
+				if (!readTypeOrKey(entryType, ifs, from_ascii("{("))
+				    || ifs.eof())
+					continue;
+
+				if (entryType == from_ascii("comment")) {
+					// NOTE(review): requires <limits> - confirm it is included
+					ifs.ignore(std::numeric_limits<int>::max(), '\n');
+				} else {
+					// look for entry delimiter
+					ifs.get(ch);
+					if (ifs.eof())
+						break;
+					if (ch == '(') entryDelim = ')';
+					else if (ch == '{') entryDelim = '}';
+
+					if (entryDelim == 'x') {
+						// invalid entry delimiter
+						ifs.putback(ch);
+					} else {
+						if (entryType == from_ascii("string")) {
+							// read string and add it to the strings map
+							// (or replace its old value)
+							docstring name;
+							docstring value;
+							if (!readTypeOrKey(name, ifs, from_ascii("#=}),"))
+							    || ifs.eof())
+								continue;
+							ifs.get(ch);
+							if (ifs.eof() || ch != '=')
+								continue;
+							if (!readValue(value, ifs, strings))
+								continue;
+							strings[name] = value;
+						} else if (entryType == from_ascii("preamble")) {
+							// preamble definitions are discarded.
+							// can they be of any use in lyx?
+							docstring value;
+							if (!readValue(value, ifs, strings))
+								continue;
+						} else {
+							docstring key;
+							docstring fields;
+							docstring name;
+							docstring value;
+							docstring newline;
+
+							if (!readTypeOrKey(key, ifs, from_ascii(",})"))
+							    || ifs.eof())
+								continue;
+
+							// now we have a key, so we will add an entry
+							// (even if it's empty)
+							bool readNext = removeWSAndComma(ifs);
+
+							while (!ifs.eof() && readNext) {
+								// read field name
+								if (!readTypeOrKey(name, ifs, from_ascii("=}),"))
+								    || ifs.eof())
+									break;
+
+								// next char must be an equal sign
+								ifs.get(ch);
+								if (ifs.eof())
+									break;
+								if (ch != '=') {
+									ifs.putback(ch);
+									break;
+								}
+
+								// read field value
+								if (!readValue(value, ifs, strings))
+									break;
+
+								fields += newline;
+								fields += name + from_ascii(" = {") + value + '}';
+								if (!newline.length())
+									newline = from_ascii(",\n");
+
+								readNext = removeWSAndComma(ifs);
+							}
+
+							keys.push_back(pair<string, docstring>(
+								to_utf8(key), fields));
+						}
 					}
 				}
-		} else if (!keys.empty())
-			keys.back().second += linebuf + '\n';
+			}
 		}
 	}
 }