Hi, I'm parsing XML files using ElementTree from xml.etree (see the code below, which is also attached as xml_parse_example.py).
However, I'm coming across input XML files (an example is attached: tmp.xml) that include invalid characters, which produce the following traceback:

$ python xml_parse_example.py
Traceback (most recent call last):
  File "xml_parse_example.py", line 63, in <module>
    tree = xml2dict.open_and_parse_xml_file()
  File "xml_parse_example.py", line 14, in open_and_parse_xml_file
    tree = ElementTree.parse(f)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 862, in parse
    tree.parse(source, parser)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 586, in parse
    parser.feed(data)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 1245, in feed
    self._parser.Parse(data, 0)
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 6, column 34

I read the documentation for xml.etree.ElementTree and see that it may take an optional parser parameter, but I don't know what this parser should be in order to ignore the invalid characters.
Could you suggest a way to call ElementTree so that it won't bomb on these invalid characters?

Thanks,
Ron.
# ________________________________
#!/usr/bin/env python
"""Parse an XML file with ElementTree and pretty-print it as nested dicts/lists."""

from xml.etree import ElementTree
import pprint

# When True, every generated entry additionally carries the element's
# ``tail`` text under the key 'tail'.
compute_tail = False


class XmlFileToDict(object):
    """Convert the XML document stored at ``xml_file_path`` into dicts and lists."""

    def __init__(self, xml_file_path):
        """Remember the path of the XML file to parse."""
        self.xml_file_path = xml_file_path

    def open_and_parse_xml_file(self):
        """Parse ``self.xml_file_path`` and return the resulting ElementTree.

        Raises whatever ``open`` or ``ElementTree.parse`` raise, e.g. a
        parse error when the document is not well-formed.
        """
        # Binary mode hands the raw bytes to the parser, so the encoding named
        # in the XML declaration is honoured and no text-mode decoding or
        # newline translation is applied before parsing.
        with open(self.xml_file_path, 'rb') as f:
            return ElementTree.parse(f)

    def dict_list(self, node):
        """Return ``{node.tag: {...}}`` describing ``node`` and its subtree.

        The inner dict holds the list built by ``xml_to_dict`` under 'value'
        and the element's attributes under 'attribs' (plus 'tail' when
        ``compute_tail`` is set).
        """
        children = []
        self.xml_to_dict(node, children)
        # NOTE: this top-level entry uses the key 'attribs' while nested
        # entries use 'attributes'; kept as-is so existing output is unchanged.
        entry = {'value': children, 'attribs': node.attrib}
        if compute_tail:
            entry['tail'] = node.tail
        return {node.tag: entry}

    def xml_to_dict(self, node, res):
        """Append one ``{tag: entry}`` dict per child of ``node`` to ``res``.

        A child without children of its own contributes its text as 'value';
        a child with children contributes the recursively built list.  When
        ``node`` itself has no children, a single entry for ``node`` is
        appended instead.  ``res`` is modified in place; nothing is returned.
        """
        if not len(node):
            res.append({node.tag: self._entry(node, node.text)})
            return
        for child in node:
            if len(child):
                value = []
                self.xml_to_dict(child, value)
            else:
                value = child.text
            res.append({child.tag: self._entry(child, value)})

    @staticmethod
    def _entry(element, value):
        """Build the per-element dict: value, attributes and optionally tail."""
        entry = {'value': value, 'attributes': element.attrib}
        if compute_tail:
            entry['tail'] = element.tail
        return entry


if __name__ == '__main__':
    xml_file_path = 'tmp.xml'
    xml2dict = XmlFileToDict(xml_file_path)
    tree = xml2dict.open_and_parse_xml_file()
    xml_dict = xml2dict.dict_list(tree.getroot())
    pprint.pprint(xml_dict)
# ________________________________
tmp.xml
Description: tmp.xml
xml_parse_example.py
Description: xml_parse_example.py
-- http://mail.python.org/mailman/listinfo/python-list