Very new to Python, running 2.5 on windows. I am processing an XML file (7.2MB). Using the standard library I am recursively processing each node and parsing it. The branches don't go particularly deep. What is happening is that the program is running really really slowly, so slow that even running it over night, it still doesn't finish. Stepping through it I have noticed that memory usage has shot up from 190MB to 624MB and continues to climb. If I set a break point and then stop the program the memory is not released. It is not until I shutdown PythonWin that the memory gets released. I thought this might mean objects were not getting GCed, so through the interactive window I imported gc. gc.garbage is empty. gc.collect() seems to fix the problem (after much thinking) and reports 2524104. Running it again returns 0. I thought that garbage collection was automatic, if I use variables in a method do I have to del them? I tried putting a "del node" in all my for node in .... loops but that didn't help. collect() reports the same number. Tried putting gc.collect() at the end of the loops but that didn't help either. If I have the program at a break and do gc.collect() it doesn't fix it, so whatever referencing is causing problems is still active. My program is parsing the XML and generating a Python program for SQLalchemy, but the program never gets a chance to run the memory problem is prior to that. It probably has something to do with the way I am string building.
My apologies for the long post but without being able to see the code I doubt anyone can give me a solid answer so here it goes (sorry for the lack of comments): from xml.dom import minidom import os import gc class xmlProcessing: """ General class for XML processing""" def process(self, filename="", xmlString=""): if xmlString: pass elif filename: xmldoc = minidom.parse(filename) self.parse( xmldoc.documentElement ) def parseBranch(self, parentNode): """ Process an XML branch """ for node in parentNode.childNodes: try: parseMethod = getattr(self, "parse_%s" % node.__class__.__name__) except AttributeError: continue if parseMethod(node): continue self.parseBranch(node) del node def parse_Document(self, node): pass def parse_Text(self, node): pass def parse_Comment(self, node): pass def parse_Element(self, node): try: handlerMethod = getattr(self, "do_%s" % node.tagName) except AttributeError: return False handlerMethod(node) return True class reptorParsing(xmlProcessing): """ Specific class for generating a SQLalchemy program to create tables and populate them with data""" def __init__(self): self.schemaPreface = """from sqlalchemy import * from sqlalchemy.ext.declarative import declarative_base engine = create_engine('sqlite:///tutorial.db', echo=False) metadata = MetaData() Base = declarative_base()""" self.schemaTables = "" self.schemaFields = "" self.dataUpdate = "" self.tableDict = {} self.tableName = "" self.tables = "" def parse(self, parentNode): """Main entry point to begin processing a XML document""" self.parseBranch(parentNode) # Properties such as schemaTables and .tables are populated by the various methods below fupdate=open(os.path.join(os.getcwd(), "update.py"), 'w') if self.schemaTables: fupdate.write("import schema\n") f=open(os.path.join(os.getcwd(), "schema.py"), 'w') f.write(self.schemaPreface+"\n"+self.schemaTables+ '\n' + "metadata.create_all(engine)\n"+ "print 'hello 2'") f.close() if self.tables: fupdate.write(self.tables) # f=open(os.path.join(os.getcwd(), "dataUpdate.py"), 'w') # f.write(self.dataUpdate) # f.close() fupdate.close() def do_TABLES(self, tableNode): """Process schema for tables""" for node in tableNode.childNodes: self.tableName = node.tagName # Define a declaritive mapping class self.schemaTables += """\nclass %s(Base): __tablename__ = '%s' """ % (self.tableName, self.tableName) self.schemaFields = "" # allow for userA = users("Billy","Bob") via a __init__() self.schemaInitPreface = " def __init__(self" self.schemaInitBody = "" self.parseBranch(node) self.schemaInitPreface += "):\n" self.schemaTables += self.schemaFields + "\n" + \ self.schemaInitPreface + \ self.schemaInitBody + "\n" gc.collect() def do_FIELDS(self, fieldsNode): """Process schema for fields within tables""" for node in fieldsNode.childNodes: if self.schemaFields: self.schemaFields += "\n" cType = "" # The attribute type holds the type of field crType = node.attributes["type"].value if crType==u"C": cType = "String(length=%s)" % node.attributes["len"].value elif crType==u"N" and node.attributes["dec"].value==u'0': cType = "Integer" elif crType==u"N": cType = "Numeric(precision=%s, scale=%s)" % (node.attributes["len"].value,node.attributes["dec"].value) elif crType==u"L": cType = "Boolean" elif crType==u"T": cType = "DateTime" elif crType==u"D": cType = "Date" elif crType==u"M" or crType==u"G": cType = "Text" if node.attributes.getNamedItem("primary"): cType += ", primary_key=True" self.schemaFields += " %s = Column(%s)" % (node.tagName, cType) self.schemaInitPreface += ", \\\n %s" % (node.tagName) self.schemaInitBody += " self.%s = %s\n" % (node.tagName, node.tagName) self.tableDict[self.tableName + "." + node.tagName] = crType del node def do_DATA(self, dataNode): """This is for processing actual data to be pushed into the tables Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE -> FIELD_NAME -> VALUE""" for node in dataNode.childNodes: self.dataUpdate = """ import time from datetime import * from sqlalchemy import * from sqlalchemy.orm import * engine = create_engine('sqlite:///tutorial.db', echo=False) Session = sessionmaker() Session.configure(bind=engine) session = Session() """ self.keyValue = "" self.keyField = node.attributes["key"].value self.tableName = node.tagName self.parseBranch(node) self.tables += "\nimport %s_update.py" % (self.tableName) f=open(os.path.join(os.getcwd(), self.tableName + "_update.py"), 'w') f.write(self.dataUpdate) f.close() gc.collect() def do_TUPLE(self, tupleNode): """ A TUPLE is what the XML file refers to a table row Sits below a DATA child""" self.dataUpdate += """ entry = %s() session.add(entry) """ % (self.tableName) for node in tupleNode.childNodes: for dataNode in node.childNodes: crType = self.tableDict[self.tableName + "." + node.tagName] if crType==u"C" or crType==u"M": cValue = '"""%s"""' % dataNode.data elif crType==u"T": cValue = 'datetime.strptime("'+dataNode.data+'", "%Y-%m-%d %H:%M")' elif crType==u"D": cValue = 'datetime.strptime("'+dataNode.data+'", "%Y-%m-%d")' else: cValue = dataNode.data self.dataUpdate += "\nentry.%s = %s" % (node.tagName, cValue) del dataNode self.dataUpdate += "\nsession.commit()" del node if __name__ == '__main__': replicate = reptorParsing() replicate.process(filename=os.path.join(os.getcwd(), "request.xml")) import update -- http://mail.python.org/mailman/listinfo/python-list