# Marko Rauhamaa wrote:
# > Random832 <random...@fastmail.com>:
# >> You know what would be really nice? A "semi-incremental" parser that
# >> can e.g. yield (whether through an event or through the iterator
# >> protocol) a fully formed element (preferably one that can be queried
# >> with xpath) at a time for each record of a document representing a
# >> list of objects. Does anything like that exist?
# >
# > You can construct that from a SAX parser, but it's less convenient than
# > it could be. Python's JSON parser doesn't have it so I've had to build a
# > clumsy one myself:

def decode_json_object_array(self):
    """Incrementally decode a JSON array, yielding one element at a time.

    Pulls text chunks from ``self.get_text()`` (an iterator of str) and
    yields each array element as soon as enough text has accumulated to
    parse it.  Junk before the opening "[" or after the closing "]"
    triggers a ``json`` decode error.

    NOTE(review): reconstructed from a whitespace-mangled mailing-list
    quote -- this nesting is the most plausible reading, but the original
    indentation is not recoverable; verify against the original post.
    ``json.JSONDecoder.raw_decode`` would be the cleaner building block
    for incremental decoding.
    """
    # A very clumsy implementation of an incremental JSON decoder.
    it = self.get_text()
    inbuf = ""
    # Phase 1: accumulate input until the array's opening "[" arrives.
    while True:
        try:
            inbuf += next(it)
        except StopIteration:
            # a premature end; trigger a decode error
            json.loads("[" + inbuf)
        try:
            head, tail = inbuf.split("[", 1)
        except ValueError:
            continue
        break
    # trigger a decode error if head contains junk
    json.loads(head + "[]")
    inbuf = ""
    chunk = tail
    # Phase 2: split the stream at "," and "]" candidates and try to
    # decode each accumulated fragment; a successful parse is one
    # complete array element.  The *_maybe variables re-insert the
    # separator that split() removed when a fragment spans a split point.
    while True:
        bracket_maybe = ""
        for big in chunk.split("]"):
            comma_maybe = ""
            for small in big.split(","):
                inbuf += comma_maybe + small
                comma_maybe = ","
                try:
                    yield json.loads(inbuf)
                # except json.JSONDecodeError:
                except ValueError:  # legacy exception
                    pass
                else:
                    inbuf = comma_maybe = ""
            inbuf += bracket_maybe
            bracket_maybe = "]"
            try:
                yield json.loads(inbuf)
            # except json.JSONDecodeError:
            except ValueError:  # legacy exception
                pass
            else:
                inbuf = ""
        try:
            # NOTE(review): "+=" keeps already-processed text in ``chunk``,
            # so the next loop iteration re-splits it; separator leftovers
            # can pollute ``inbuf`` across chunk boundaries.  Works for a
            # single-chunk stream; multi-chunk behavior should be checked
            # against the original post before relying on it.
            chunk += next(it)
        except StopIteration:
            break
    # trigger a decode error if chunk contains junk
    json.loads("[" + chunk)

# > It could easily be converted to an analogous XML parser.
# For XML you could use iterparse, see
# http://effbot.org/elementtree/iterparse.htm
# I came up with the following and found memory usage to be stable.

import random
import xml.etree.ElementTree
from xml.sax.saxutils import escape


def iter_elems(file, tag):
    """Yield every complete *tag* element of *file* as it is parsed.

    Memory stays bounded on arbitrarily large (even endless) documents:
    after each match the document root is cleared, dropping references
    to elements the consumer has already seen.
    """
    it = xml.etree.ElementTree.iterparse(file, events=("start", "end"))
    # The very first event is the "start" of the document root.
    root = next(it)[1]
    for event, elem in it:
        if event == "end" and elem.tag == tag:
            yield elem
            # Discard finished children of the root so they can be
            # garbage-collected; this is what keeps memory usage flat.
            root.clear()


# --- example below ---

class NeverendingXMLFile:
    """File-like object producing an endless <doc> of random <word>s."""

    def __init__(self, words):
        self.words = words
        self.chunks = self.gen_chunks()

    def gen_chunks(self):
        # Infinite generator: the opening tag once, then random words
        # forever -- the document is deliberately never closed.
        words = self.words
        yield b"<doc>"
        while True:
            yield "<word>{}</word>".format(random.choice(words)).encode()

    def read(self, size=None):
        # iterparse only needs read(); size is ignored -- one pre-built
        # chunk is returned per call.
        return next(self.chunks)


def filelike():
    """Build a NeverendingXMLFile from the system word list."""
    with open("/usr/share/dict/words") as f:
        words = [escape(line.strip()) for line in f]
    return NeverendingXMLFile(words)


if __name__ == "__main__":
    for word in iter_elems(filelike(), "word"):
        print(word.text)

# In theory this should be even simpler with lxml, as it exposes the root
# element and allows filtering by tag:
# http://lxml.de/parsing.html#iterparse-and-iterwalk
# Unfortunately root seems to be set after the closing </...> and thus
# doesn't help with dereferencing seen elements during iteration.
# --
# https://mail.python.org/mailman/listinfo/python-list