On Jun 3, 3:59 pm, Chris Rebert <c...@rebertia.com> wrote: > On Thu, Jun 3, 2010 at 1:44 PM, bfrederi <brfrederi...@gmail.com> wrote: > > I am using lxml iterparse and running into a very obscure error. When > > I run iterparse on a file, it will occasionally return an element that > > has a element.text == None when the element clearly has text in it. > > > I copy and pasted the problem xml into a python string, used StringIO > > to create a file-like object out of it, and ran a test using iterparse > > with expected output, and it ran perfectly fine. So it only happens > > when I try to run iterparse on the actual file. > > > So then I tried opening the file, reading the data, turning that data > > into a file-like object using StringIO, then running iterparse on it, > > and the same problem (element.text == None) occurred. > > > I even tried this: > > f = codecs.open(abbyy_filename, 'r', encoding='utf-8') > > file_data = f.read() > > file_like_object = StringIO.StringIO(file_data) > > for event, element in iterparse(file_like_object, events=("start", > > "end")): > > IIRC, XML parsers operate on bytes directly (since they have to > determine the encoding themselves anyway), not pre-decoded Unicode > characters, so I think your manual UTF-8 decoding could be the > problem. > Have you tried simply: > > f = open(abbyy_filename, 'r') > for event, element in iterparse(f, events=("start", "end")): > #whatever > > ? > > Apologies if you already have, but since you didn't include the > original, albeit probably trivial, error-causing code, this relatively > simple error couldn't be ruled out. > > Cheers, > Chris > --http://blog.rebertia.com
Sorry for not mentioning it, but I tried that as well and it failed. Here is the relevant class. AbbyyLine and Abbyyword just take the element's text and writes it to a file/file-like object. parse_doc is where I use iterparse. The relevant part is very minimal and there is a lot of fluff to ignore, so I didn't initially post it: class AbbyyDocParse(object): """Takes an abbyy filename and parses the contents""" def __init__(self, abbyy_filename, extension=DEFAULT_ABBYY_EXT, format_list=OUTPUT_TYPES, string_only=False): self.extension = extension self.format_list = format_list #Create the file handles for the output files self.create_filehandles(abbyy_filename, string_only) #Parse the document self.parse_doc(abbyy_filename) #Close the output filehandles self.close_filehandles(abbyy_filename, string_only) def create_filehandles(self, abbyy_filename, string_only): """Create output filehandles""" #if output goes to a file if not string_only: #Make sure the file is an abbyy file if not abbyy_filename.endswith(self.extension): raise ParserException, "Bad abbyy filename given: %s" \ % (abbyy_filename) #get the base path and filename for output files filename = abbyy_filename.replace(self.extension, '') #Loop through the different formats for format_type in self.format_list: #if output goes to a file if not string_only: #Create output filename out_file = "%s%s" % (filename, OUTPUT_EXTENSIONS.get(format_type)) #Opens the format type filehandle try: setattr(self, "%s_handle" % (format_type), open(out_file,'w')) except: raise IOError, "Could not open file: %s" % (out_file) #if output goes to a string else: #Opens the format type StringIO try: setattr(self, "%s_handle" % (format_type), StringIO.StringIO()) except: raise IOError, "Could not open string output: %s" % (out_file) def parse_doc(self, abbyy_filename): """Parses the abbyy document""" #Write the first line of the xml doc, if specified if getattr(self, 'xml_handle', None): self.xml_handle.write('<?xml version="1.0" encoding="utf-8"?>\n') #Memory efficient iterparse opens file and loops through content for event, element in iterparse(abbyy_filename, events=("start", "end")): #ignore the namespace, if it has one if NAMESPACE_REGEX.search(element.tag, 0): element_tag = NAMESPACE_REGEX.search(element.tag, 0).group(1) else: element_tag = element.tag #if this is the page element if element_tag == 'page': self.write_page(event, element) #If at the beginning of the line elif element_tag == 'line' and event == 'start': #Create the line line = AbbyyLine(element) #Instantiate first word word = AbbyyWord(line) #If at the end of the line, and an output text file exists if element_tag == 'line' and event == 'end' and \ getattr(self, 'text_handle', None): #output line data to text file line.write_line(self.text_handle) #If at the end of the line, and an output text file exists if element_tag == 'line' and event == 'end' and \ getattr(self, 'xml_handle', None): #output line data to text file word.write_word(self.xml_handle) #if outputting to an xml file, create word data if getattr(self, 'xml_handle', None) and \ element_tag == 'charParams' and event == 'start': #Insert character into word word.insert_char(element, self.xml_handle) #if outputting to a text file, create line data if getattr(self, 'text_handle', None) and \ element_tag == 'charParams' and event == 'start': #Insert character into line line.insert_char(element) def write_page(self, event, element): """Parse the page contents""" #page open tag event if event == 'start': #Write page info to xml file if getattr(self, 'xml_handle', None): #Get the page info x_dim = element.get('width') y_dim = element.get('height') resolution = element.get('resolution') #Write the page info to the file self.xml_handle.write('<page>\n') self.xml_handle.write('<filename/>\n') self.xml_handle.write('<confidence/>\n') self.xml_handle.write("<xDim>%s</xDim>\n" % (x_dim)) self.xml_handle.write("<yDim>%s</yDim>\n" % (y_dim)) self.xml_handle.write("<resolution>%s</resolution>\n" % (resolution)) self.xml_handle.write('<zone/>\n') self.xml_handle.write('<wordsboundingboxes>\n') #page close tag event elif event == 'end': #Write page info to xml file if getattr(self, 'xml_handle', None): #Write closing tags to file self.xml_handle.write('</wordsboundingboxes>\n') self.xml_handle.write('</page>') def write_line(self, event, element): """Parse the line contents""" #line open tag event if event == 'start': pass #page close tag event elif event == 'end': pass def write_word(self, event, element): """Parse the charParams contents""" pass def close_filehandles(self, abbyy_filename, string_only): """Close the open filehandles""" #if the files exist if not string_only: #Loop through the different formats for format_type in self.format_list: #Opens the format type filehandle try: getattr(self, "%s_handle" % (format_type)).close() except: raise IOError, "Could not close format type: %s for file: %s" \ % (format_type, abbyy_filename) -- http://mail.python.org/mailman/listinfo/python-list