[xml] [PATCH] less-than character and HTML parser module

Christian Schoenebeck Tue, 14 Apr 2015 08:24:00 -0700

On Tuesday 14 April 2015 09:31:25 Alex Bligh wrote:
> On 13 Apr 2015, at 22:43, Christian Schoenebeck <schoeneb...@crudebyte.com> wrote:
> > I just encountered an issue with stand-alone less-than characters if the
> > document is parsed by libxml2's HTML parser module. Consider that you have
> > text in your HTML document like:
> >     a < b
> > 
> > The less-than sign in this case is interpreted by the HTML parser module
> > as tag start, causing subsequent text (in this case "< b") to be
> > dropped.
> 
> Isn't that correct? Shouldn't your document have
> 
>      a &lt; b


If it were a well-formed HTML document, then yes. But as I said, in reality
there are a large number of HTML documents which contain text with raw
less-than characters, encouraged by the fact that all major HTML browsers can
handle them. libxml's HTML parser is still an exception here.

Attached is a patch suggesting a fix for this issue.

Best regards,
Christian Schoenebeck

diff -u libxml2-2.9.1+dfsg1.orig/HTMLparser.c libxml2-2.9.1+dfsg1/HTMLparser.c
--- libxml2-2.9.1+dfsg1.orig/HTMLparser.c	2015-04-14 13:05:01.000000000 +0200
+++ libxml2-2.9.1+dfsg1/HTMLparser.c	2015-04-14 16:25:19.999633601 +0200
@@ -2948,8 +2948,10 @@
 
 
 /**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
  * @ctxt:  an HTML parser context
+ * @prep:  optional character to be prepended to text, 0 if no character
+ *         shall be prepended
  *
  * parse a CharData section.
  * if we are within a CDATA section ']]>' marks an end of section.
@@ -2958,12 +2960,15 @@
  */
 
 static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
-    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, char prep) {
+    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
     int nbchar = 0;
     int cur, l;
     int chunk = 0;
 
+    if (prep)
+	buf[nbchar++] = prep;
+
     SHRINK;
     cur = CUR_CHAR(l);
     while (((cur != '<') || (ctxt->token == '<')) &&
@@ -3043,6 +3048,21 @@
 }
 
 /**
+ * htmlParseCharData:
+ * @ctxt:  an HTML parser context
+ *
+ * parse a CharData section.
+ * if we are within a CDATA section ']]>' marks an end of section.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+    htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
  * htmlParseExternalID:
  * @ctxt:  an HTML parser context
  * @publicID:  a xmlChar** receiving PubidLiteral
@@ -4157,14 +4177,22 @@
 	    }
 
 	    /*
-	     * Third case :  a sub-element.
+	     * Third case : (unescaped) stand-alone less-than character.
+	     */
+	    else if ((CUR == '<') && (IS_BLANK_CH(NXT(1)) || (NXT(1) == '='))) {
+		NEXT;
+		htmlParseCharDataInternal(ctxt, '<');
+	    }
+
+	    /*
+	     * Fourth case :  a sub-element.
 	     */
 	    else if (CUR == '<') {
 		htmlParseElement(ctxt);
 	    }
 
 	    /*
-	     * Fourth case : a reference. If if has not been resolved,
+	     * Fifth case : a reference. If it has not been resolved,
 	     *    parsing returns it's Name, create the node
 	     */
 	    else if (CUR == '&') {
@@ -4172,7 +4200,7 @@
 	    }
 
 	    /*
-	     * Fifth case : end of the resource
+	     * Sixth case : end of the resource
 	     */
 	    else if (CUR == 0) {
 		htmlAutoCloseOnEnd(ctxt);
@@ -4567,7 +4595,15 @@
 	    }
 
 	    /*
-	     * Third case :  a sub-element.
+	     * Third case : (unescaped) stand-alone less-than character.
+	     */
+	    else if ((CUR == '<') && (IS_BLANK_CH(NXT(1)) || (NXT(1) == '='))) {
+		NEXT;
+		htmlParseCharDataInternal(ctxt, '<');
+	    }
+
+	    /*
+	     * Fourth case :  a sub-element.
 	     */
 	    else if (CUR == '<') {
 		htmlParseElementInternal(ctxt);
@@ -4578,7 +4614,7 @@
 	    }
 
 	    /*
-	     * Fourth case : a reference. If if has not been resolved,
+	     * Fifth case : a reference. If it has not been resolved,
 	     *    parsing returns it's Name, create the node
 	     */
 	    else if (CUR == '&') {
@@ -4586,7 +4622,7 @@
 	    }
 
 	    /*
-	     * Fifth case : end of the resource
+	     * Sixth case : end of the resource
 	     */
 	    else if (CUR == 0) {
 		htmlAutoCloseOnEnd(ctxt);

_______________________________________________
xml mailing list, project page  http://xmlsoft.org/
xml@gnome.org
https://mail.gnome.org/mailman/listinfo/xml

[xml] [PATCH] less-than character and HTML parser module

Reply via email to