and the tweak is:

    parser = etree.HTMLParser(recover=False)
    return etree.HTML(xml, parser)
# That reduces tolerance. The entire assert_xml() is (these are methods of a
# TestCase-style class; they rely on self.assertTrue / self.assertEqual):


def _xml_to_tree(self, xml):
    """Turn ``xml`` into an lxml tree and remember its markup in ``self._xml``.

    ``xml`` may be a string of XML or HTML, or an object that already offers
    ``.xpath()`` (for example a node returned by an earlier ``assert_xml()``
    call), in which case it is returned unchanged.

    ``self._xml`` is set as a side effect so that assertion failure messages
    can quote the document that was searched.

    Parse errors raised by lxml propagate to the caller.
    """
    from lxml import etree

    # Already-parsed input is recognised by duck typing rather than by waiting
    # for etree.XML() to raise ValueError: that exception is also raised for
    # genuinely unparseable strings, which must not be mistaken for trees.
    if hasattr(xml, 'xpath'):
        # Serialise the node so failure messages show real markup; str(node)
        # would only give '<Element ... at 0x...>'.
        self._xml = etree.tostring(xml, encoding='unicode')
        return xml

    self._xml = xml

    if '<html' in xml[:200]:  # NOTE the condition COULD suck more!
        # recover=False makes the HTML parser reject broken markup instead of
        # silently repairing it.
        parser = etree.HTMLParser(recover=False)
        return etree.HTML(xml, parser)

    return etree.XML(xml)


def assert_xml(self, xml, xpath, **kw):
    """Check that a given extent of XML or HTML contains a given XPath, and return its first node.

    Keyword arguments:
    verbose -- when true, print the matched node via ``reveal_xml()``.
    """
    tree = self._xml_to_tree(xml)
    nodes = tree.xpath(xpath)
    self.assertTrue(len(nodes) > 0, xpath + ' not found in ' + self._xml)
    node = nodes[0]

    if kw.get('verbose', False):
        self.reveal_xml(node)  # "here have ye been? What have ye seen?"--Morgoth

    return node


def reveal_xml(self, node):
    """Spew an XML node as source, for diagnosis."""
    from lxml import etree

    # encoding='unicode' makes tostring() return text rather than bytes, and
    # the call form of print works on both Python 2 and Python 3.
    # NOTE(review): per the lxml FAQ, pretty_print leaves the layout alone
    # where the tree holds whitespace-only text nodes; parsing with
    # remove_blank_text=True is the documented remedy -- confirm before relying on it.
    print(etree.tostring(node, pretty_print=True, encoding='unicode'))


def deny_xml(self, xml, xpath):
    """Check that a given extent of XML or HTML does not contain a given XPath."""
    tree = self._xml_to_tree(xml)
    nodes = tree.xpath(xpath)
    self.assertEqual(0, len(nodes), xpath + ' should not appear in ' + self._xml)


# -- http://mail.python.org/mailman/listinfo/python-list