Dear SWORD developers,

I wrote to this list about two years ago (4th February, 2010, to be precise) 
with a couple of suggestions and a patch for the SWORD library.

Unfortunately, the patch I suggested was not applied (an oversight, I'm 
sure), and I've been way too busy with other things to chase it up... until 
now.

I wrote:

> Firstly, thanks for developing the SWORD library!  I have been using
> this library, in conjunction with the BibleTime front-end, for many
> years.
>
> I have recently started to develop some OSIS documents of my own.
> In doing so, I found that the XML parser in osis2mod is somewhat
> fragile---something that you are, no doubt, aware of.
>
> In particular, osis2mod does not handle XML comments at all, nor
> does it correctly parse the <header> element.  Being able to handle
> XML comments is, I think, quite important---I like to document the
> SVN revision ID, for example, in an XML comment.
>
> Furthermore, the osis2mod XML parser looks for the first <div> in
> the document, no matter where that occurs.  In particular, if the
> OSIS document includes a <revisionDesc> tag in the header, it will
> have <p> tags as well---which will be translated by transformBSP()
> into <div> tags---and get used as the starting point for the
> document!
>
> For this reason, I have generated a quick patch that will solve
> these particular problems.  Could you please apply it to the SVN
> head for utilities/osis2mod.cpp?  Comments are handled similarly to
> spaces: they are skipped.  And handleToken() now looks for the first
> <div> after the </header> end tag.

DM Smith replied with:

> Sorry for the late reply. This patch looks good and we'll commit it
> shortly.

I am attaching the patch to this e-mail, as I find that the problem still 
exists in the library.  Could you please apply it?  Thanks!

Yours in Christ,

John Zaitseff

-- 
John Zaitseff                    ,--_|\    The ZAP Group
Phone:  +61 2 9643 7737         /      \   Sydney, Australia
E-mail: j.zaits...@zap.org.au   \_,--._*   http://www.zap.org.au/
                                      v
Index: utilities/osis2mod.cpp
===================================================================
--- utilities/osis2mod.cpp	(revision 2691)
+++ utilities/osis2mod.cpp	(working copy)
@@ -603,6 +603,7 @@
 
 	// Flag used to indicate where useful text begins
 	static bool               firstDiv        = false;
+	static bool               headerEnded     = false;
 
 	// Retain the sID of book, chapter and verse (commentary) divs so that we can find them again.
 	// This relies on transformBSP.
@@ -643,9 +644,9 @@
 			}
 		}
 
-		// throw away everything up to the first div
+		// throw away everything up to the first div (that is outside the header)
 		if (!firstDiv) {
-			if (tokenName == "div") {
+			if (headerEnded && (tokenName == "div")) {
 				if (debug & DEBUG_OTHER) {
 					cout << "DEBUG(FOUND): Found first div and pitching prior material: " << text << endl;
 				}
@@ -962,8 +963,16 @@
 			}
 		}
 
-		// We haven't seen the first div so there is nothing to do.
+		// We haven't seen the first div outside the header so there is little to do.
 		if (!firstDiv) {
+			if (tokenName == "header") {
+				headerEnded = true;
+
+				if (debug & DEBUG_OTHER) {
+					cout << "DEBUG(FOUND): End of header found" << endl;
+				}
+			}
+
 			// Collect the content so it can be used to suggest the module's conf.
 			return false;
 		}
@@ -1319,6 +1328,16 @@
 }
 
 void processOSIS(istream& infile) {
+	typedef enum {
+		CS_NOT_IN_COMMENT,		// or seen starting "<"
+		CS_SEEN_STARTING_EXCLAMATION,
+		CS_SEEN_STARTING_HYPHEN,
+		CS_IN_COMMENT,
+		CS_SEEN_ENDING_HYPHEN,
+		CS_SEEN_SECOND_ENDING_HYPHEN,
+		CS_SEEN_ENDING_GREATER_THAN
+	} t_commentstate;
+
 	activeOsisID[0] = '\0';
 
 	strcpy(currentOsisID,"N/A");
@@ -1333,13 +1352,15 @@
 
 	SWBuf token;
 	SWBuf text;
+	bool incomment = false;
+	t_commentstate commentstate = CS_NOT_IN_COMMENT;
 	bool intoken = false;
 	bool inWhitespace = false;
 	bool seeingSpace = false;
 	unsigned char curChar = '\0';
 
 	while (infile.good()) {
-		
+
 		int possibleChar = infile.get();
 
 		// skip the character if it is bad. infile.good() will catch the problem
@@ -1355,6 +1376,95 @@
 			continue;
 		}
 
+		// Handle XML comments starting with "<!--", ending with "-->"
+
+		if (intoken && !incomment) {
+			switch (commentstate) {
+				case CS_NOT_IN_COMMENT :
+					if (curChar == '!') {
+						commentstate = CS_SEEN_STARTING_EXCLAMATION;
+						token.append(curChar);
+						continue;
+					} else {
+						break;
+					}
+
+				case CS_SEEN_STARTING_EXCLAMATION :
+					if (curChar == '-') {
+						commentstate = CS_SEEN_STARTING_HYPHEN;
+						token.append(curChar);
+						continue;
+					} else {
+						commentstate = CS_NOT_IN_COMMENT;
+						break;
+					}
+
+				case CS_SEEN_STARTING_HYPHEN :
+					if (curChar == '-') {
+						incomment = true;
+						commentstate = CS_IN_COMMENT;
+						token.append(curChar);
+
+						if (debug & DEBUG_OTHER) {
+							cout << "DEBUG(COMMENTS): in comment" << endl;
+						}
+
+						continue;
+					} else {
+						commentstate = CS_NOT_IN_COMMENT;
+						break;
+					}
+
+				default:
+					cout << "FATAL(COMMENTS): unknown commentstate on comment start: " << commentstate << endl;
+					exit(EXIT_BAD_NESTING);
+			}
+		}
+
+		if (incomment) {
+			switch (commentstate) {
+				case CS_IN_COMMENT:
+					if (curChar == '-') {
+						commentstate = CS_SEEN_ENDING_HYPHEN;
+						continue;
+					} else {
+						// ignore the character
+						continue;
+					}
+
+				case CS_SEEN_ENDING_HYPHEN :
+					if (curChar == '-') {
+						commentstate = CS_SEEN_SECOND_ENDING_HYPHEN;
+						continue;
+					} else {
+						// ignore character
+						commentstate = CS_IN_COMMENT;
+						continue;
+					}
+
+				case CS_SEEN_SECOND_ENDING_HYPHEN :
+					if (curChar == '>') {
+						intoken = false;
+						incomment = false;
+						commentstate = CS_NOT_IN_COMMENT;
+
+						if (debug & DEBUG_OTHER) {
+							cout << "DEBUG(COMMENTS): out of comment" << endl;
+						}
+
+						continue;
+					} else {
+						// ignore character
+						commentstate = CS_IN_COMMENT;
+						continue;
+					}
+
+				default:
+					cout << "FATAL(COMMENTS): unknown commentstate on comment end: " << commentstate << endl;
+					exit(EXIT_BAD_NESTING);
+			}
+		}
+
 		// Outside of tokens merge adjacent whitespace
 		if (!intoken) {
 			seeingSpace = isspace(curChar)!=0;
@@ -1373,13 +1483,16 @@
 			inWhitespace = false;
 			token.append('>');
 			// take this isalpha if out to check for bugs in text
-			if ((isalpha(token[1])) || (isalpha(token[2]))) {
+			if (isalpha(token[1]) ||
+			    (((token[1] == '/') || (token[1] == '?')) && isalpha(token[2]))) {
 				//cout << "Handle:" << token.c_str() << endl;
 				XMLTag t = transformBSP(token.c_str());
 
 				if (!handleToken(text, t)) {
 					text.append(t);
 				}
+			} else {
+				cout << "WARNING(PARSE): malformed token: " << token << endl;
 			}
 			continue;
 		}
_______________________________________________
sword-devel mailing list: sword-devel@crosswire.org
http://www.crosswire.org/mailman/listinfo/sword-devel
Instructions to unsubscribe/change your settings at above page

Reply via email to