How to get xml.etree.ElementTree not to bomb on invalid characters in an XML file?

Barak, Ron Tue, 04 May 2010 00:08:55 -0700

Hi,

I'm parsing XML files using ElementTree from xml.etree (see code below (and 
attached xml_parse_example.py)).


However, I'm coming across input XML files (an example, tmp.xml, is attached) 
that contain invalid characters, and parsing them produces the following traceback:

$ python xml_parse_example.py
Traceback (most recent call last):
  File "xml_parse_example.py", line 63, in <module>
    tree = xml2dict.open_and_parse_xml_file()
  File "xml_parse_example.py", line 14, in open_and_parse_xml_file
    tree = ElementTree.parse(f)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 862, in parse
    tree.parse(source, parser)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 586, in parse
    parser.feed(data)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 1245, in feed
    self._parser.Parse(data, 0)
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 6, column 34

I read the documentation for xml.etree.ElementTree and see that parse() may take 
an optional parser parameter, but I don't know what parser to pass so that the 
invalid characters are ignored.

Could you suggest a way to call ElementTree so that it won't bomb on these 
invalid characters?

Thanks,
Ron.

________________________________

#!/usr/bin/env python

from xml.etree import ElementTree
import pprint

# When true, every generated entry also stores the element's trailing text
# (``Element.tail``) under a 'tail' key.  Read at call time by XmlFileToDict.
compute_tail = False


class XmlFileToDict(object):
    """Convert an XML file into nested dictionaries and lists.

    Each element becomes a one-key dict ``{tag: entry}``.  The entry holds
    the element's text (for a leaf) or the list of its converted children
    under 'value', plus the element's attribute mapping.  Note that the
    top-level entry built by dict_list() stores the attributes under
    'attribs', while entries built by xml_to_dict() use 'attributes'.
    """

    def __init__(self, xml_file_path):
        """Remember the path of the XML file to convert."""
        self.xml_file_path = xml_file_path

    def open_and_parse_xml_file(self):
        """Parse the file at ``self.xml_file_path`` and return the ElementTree.

        The file is opened in binary mode so the XML parser receives the raw
        bytes and decodes them itself according to the document's own XML
        declaration, instead of relying on the platform's default text
        encoding.  Parse errors raised by ElementTree.parse() propagate to
        the caller.
        """
        with open(self.xml_file_path, 'rb') as f:
            tree = ElementTree.parse(f)
        return tree

    def dict_list(self, node):
        """Return ``{node.tag: entry}`` describing *node* and its subtree.

        The entry's 'value' is the list filled in by xml_to_dict(), and its
        'attribs' is the node's attribute mapping.  A 'tail' key is added
        when the module-level ``compute_tail`` flag is true.
        """
        converted = []
        self.xml_to_dict(node, converted)
        entry = {'value': converted, 'attribs': node.attrib}
        if compute_tail:
            entry['tail'] = node.tail
        return {node.tag: entry}

    def xml_to_dict(self, node, res):
        """Append the dictionary form of *node*'s content to the list *res*.

        For a leaf element a single ``{tag: entry}`` describing the element
        itself is appended.  For an element with children, one
        ``{child.tag: entry}`` is appended per child, in document order.
        The result is delivered through *res*; the method returns None.
        """
        # len() is used rather than truth-testing the element, which is
        # ambiguous for ElementTree elements.
        if not len(node):
            res.append({node.tag: self._entry(node.text, node)})
            return

        for child in list(node):
            collected = []
            self.xml_to_dict(child, collected)
            if len(child):
                # 'collected' holds the converted grandchildren.
                res.append({child.tag: self._entry(collected, child)})
            else:
                # A leaf child produced exactly one ready-made entry.
                res.append(collected[0])
        return

    @staticmethod
    def _entry(value, node):
        """Build the entry dict for *node* with the given 'value'."""
        entry = {'value': value, 'attributes': node.attrib}
        if compute_tail:
            entry['tail'] = node.tail
        return entry

if __name__ == '__main__':
    # Parse the sample document and pretty-print its dictionary form.
    xml2dict = XmlFileToDict('tmp.xml')
    tree = xml2dict.open_and_parse_xml_file()
    pprint.pprint(xml2dict.dict_list(tree.getroot()))

________________________________

tmp.xml
Description: tmp.xml

xml_parse_example.py
Description: xml_parse_example.py

-- 
http://mail.python.org/mailman/listinfo/python-list

How to get xml.etree.ElementTree not bomb on invalid characters in XML file ?

Reply via email to