InsetBibtex.cpp

rgheck Mon, 12 Jan 2009 05:56:06 -0800

Jürgen Spitzmüller wrote:

rgheck wrote:

Jürgen, I'd like to apply this commit, along with r27975, r27987, and
r27989, to branch. Together, these improve the presentation of citation
strings, using unicode where appropriate and parsing the "family name"
better than we now do.


Could you post a patch?

Here.

rh

Index: src/insets/InsetBibtex.cpp
===================================================================
--- src/insets/InsetBibtex.cpp	(revision 28124)
+++ src/insets/InsetBibtex.cpp	(working copy)
@@ -38,6 +38,8 @@
 #include "support/Path.h"
 #include "support/textutils.h"
 
+#include <boost/regex.hpp>
+
 #include <limits>
 
 using namespace std;
@@ -532,15 +534,16 @@
 	/// the variable strings.
 	/// @return true if reading was successfull (all single parts were delimited
 	/// correctly)
-	bool readValue(docstring & val, ifdocstream & ifs, const VarMap & strings) {
+	bool readValue(docstring & value, ifdocstream & ifs, const VarMap & strings) {
 
 		char_type ch;
 
-		val.clear();
+		value.clear();
 
 		if (!ifs)
 			return false;
 
+		docstring val;
 		do {
 			// skip whitespace
 			do {
@@ -596,7 +599,7 @@
 						lastWasWhiteSpace = false;
 						val += ' ';
 					}
-					
+
 					val += ch;
 
 					// update nesting level
@@ -657,6 +660,98 @@
 
 		ifs.putback(ch);
 
+		// Ok, we now have the value. Now we are going to go
+		// through it and replace e.g. \"a with its unicode value.
+		// We'll also strip commands such as \emph, so that
+		// it will look nice in the UI.
+		bool scanning_cmd = false;
+		bool scanning_math = false;
+		bool escaped = false; // used to catch \$, etc.
+		while (val.size()) {
+			char_type const ch = val[0];
+
+			// if we're scanning math, we output everything until we
+			// find an unescaped $, at which point we break out.
+			if (scanning_math) {
+				if (escaped)
+					escaped = false;
+				else if (ch == '\\')
+					escaped = true;
+				else if (ch == '$') 
+					scanning_math = false;
+				value += ch;
+				val = val.substr(1);
+				continue;
+			}
+
+			// if we're scanning a command name, then we just
+			// discard characters until we hit something that
+			// isn't alpha.
+			if (scanning_cmd) {
+				if (isAlphaASCII(ch)) {
+					val = val.substr(1);
+					escaped = false;
+					continue;
+				}
+				// so we're done with this command.
+				// now we fall through and check this character.
+				scanning_cmd = false;
+			}
+
+			// was the last character a \? If so, then this is something like: \\,
+			// or \$, so we'll just output it. That's probably not always right...
+			if (escaped) {
+				value += ch;
+				val = val.substr(1);
+				escaped = false;
+				continue;
+			}
+
+			if (ch == '$') {
+				value += ch;
+				val = val.substr(1);
+				scanning_math = true;
+				continue;
+			}
+
+			// we just ignore braces
+			if (ch == '{' || ch == '}') {
+				val = val.substr(1);
+				continue;
+			}
+
+			// we're going to check things that look like commands, so if
+			// this doesn't, just output it.
+			if (ch != '\\') {
+				value += ch;
+				val = val.substr(1);
+				continue;
+			}
+
+			// ok, this could be a command of some sort.
+			// Let's see if it corresponds to some unicode character.
+			// unicodesymbols has things in the form: \"{u},
+			// whereas we may see things like: \"u. So we'll
+			// look for that and change it, if necessary.
+			static boost::regex const reg("^\\\\\\W\\w");
+			if (boost::regex_search(to_utf8(val), reg)) {
+				val.insert(3, from_ascii("}"));
+				val.insert(2, from_ascii("{"));
+			}
+			docstring rem;
+			docstring const cnvtd = Encodings::fromLaTeXCommand(val, rem);
+			if (!cnvtd.empty()) {
+				// it did, so we'll take that bit and proceed with what's left
+				value += cnvtd;
+				val = rem;
+				continue;
+			}
+			// it's a command of some sort
+			scanning_cmd = true;
+			escaped = true;
+			val = val.substr(1);
+		}
+
 		return true;
 	}
 }
Index: src/support/lstrings.cpp
===================================================================
--- src/support/lstrings.cpp	(revision 28124)
+++ src/support/lstrings.cpp	(working copy)
@@ -94,6 +94,14 @@
 }
 
 
+bool isLower(char_type c)
+{
+	if (!is_utf16(c))
+		return false;
+	return ucs4_to_qchar(c).isLower();
+}
+
+
 bool isAlphaASCII(char_type c)
 {
 	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
Index: src/support/textutils.h
===================================================================
--- src/support/textutils.h	(revision 28124)
+++ src/support/textutils.h	(working copy)
@@ -26,6 +26,9 @@
 /// return true if a char is alphabetical (including accented chars)
 bool isLetterChar(char_type c);
 
+/// return true if a char is lowercase
+bool isLower(char_type c);
+
 /// return whether \p c is an alphabetic character in the ASCII range
 bool isAlphaASCII(char_type c);
 
Index: src/BiblioInfo.cpp
===================================================================
--- src/BiblioInfo.cpp	(revision 28124)
+++ src/BiblioInfo.cpp	(working copy)
@@ -28,6 +28,7 @@
 #include "support/gettext.h"
 #include "support/lassert.h"
 #include "support/lstrings.h"
+#include "support/textutils.h"
 
 #include "boost/regex.hpp"
 
@@ -75,24 +76,45 @@
 	if (name.empty())
 		return docstring();
 
-	// Very simple parser
-	docstring fname = name;
+	// first we look for a comma, and take the last name to be everything
+	// preceding the right-most one, so that we also get the "jr" part.
+	docstring::size_type idx = name.rfind(',');
+	if (idx != docstring::npos)
+		return ltrim(name.substr(0, idx));
 
-	// possible authorname combinations are:
-	// "Surname, FirstName"
-	// "Surname, F."
-	// "FirstName Surname"
-	// "F. Surname"
-	docstring::size_type idx = fname.find(',');
-	if (idx != docstring::npos)
-		return ltrim(fname.substr(0, idx));
-	idx = fname.rfind('.');
-	if (idx != docstring::npos && idx + 1 < fname.size())
-		fname = ltrim(fname.substr(idx + 1));
-	// test if we have a LaTeX Space in front
-	if (fname[0] == '\\')
-		return fname.substr(2);
-	return rtrim(fname);
+	// OK, so now we want to look for the last name. We're going to
+	// include the "von" part. This isn't perfect.
+	// Split on spaces, to get various tokens.
+	vector<docstring> pieces = getVectorFromString(name, from_ascii(" "));
+	// If we get at most two tokens, assume the last one is the last name
+	if (pieces.size() <= 2)
+		return pieces.back();
+
+	// Now we look for the first token that begins with a lower case letter.
+	vector<docstring>::const_iterator it = pieces.begin();
+	vector<docstring>::const_iterator en = pieces.end();
+	for (; it != en; ++it) {
+		if ((*it).size() == 0)
+			continue;
+		char_type const c = (*it)[0];
+		if (isLower(c))
+			break;
+	}
+
+	if (it == en) // we never found a "von"
+		return pieces.back();
+
+	// reconstruct what we need to return
+	docstring retval;
+	bool first = true;
+	for (; it != en; ++it) {
+		if (!first)
+			retval += " ";
+		else 
+			first = false;
+		retval += *it;
+	}
+	return retval;
 }
 
 docstring const BibTeXInfo::getAbbreviatedAuthor() const
Index: src/Encoding.cpp
===================================================================
--- src/Encoding.cpp	(revision 28124)
+++ src/Encoding.cpp	(working copy)
@@ -503,11 +503,15 @@
 			// does start with '\', we accept the match only if
 			// this is a valid macro, i.e., either it is a single
 			// (nonletter) char macro, or nothing else follows,
-			// or what follows is a nonletter char.
+			// or what follows is a nonletter char, or the last
+			// character is a }.
 			if ((math == tmp || text == tmp)
 			    && (tmp[0] != '\\'
 				   || (tmp.size() == 2 && !isAlphaASCII(tmp[1]))
-				   || k == cmdend || !isAlphaASCII(cmd[k]))) {
+				   || k == cmdend 
+				   || !isAlphaASCII(cmd[k])
+				   || tmp[tmp.size() - 1] == '}')
+				 ) {
 				c = it->first;
 				j = k - 1;
 				i = j + 1;

Re: [Cvslog] r28105 - /lyx-devel/trunk/src/insets/InsetBibtex.cpp

Reply via email to