Thanks all. I took your advice and have been playing with it all weekend, which has been great fun. ElementTree is awesome. I created a script that organises the XML files, because they arrive in year blocks and I hadn't realised that the required XML is mixed up with other XML. The volumes are also much greater than I realised: I checked now that I'm back at work, and it is something like 600,000 files in a year, just over a gigabyte for each year.
I'm going to add zipping up of the files and getting the required info and putting it in a db this week hopefully. It's been completely overhauled, originally I used modified date now it gets the date from the parsed xml, safer that way. The code is below but word of caution, it's hobbyist code so it'll probably make your eyes bleed =), thanks again: There was one thing that I forgot about - when ElementTree fails to parse due to an element not being closed why doesn't it close the file like object. As later on I would raise 'WindowsError: [Error 32] ...file being used by other process' when using shutil.move(). I got round this by using a 'try except' block. from __future__ import print_function import xml.etree.cElementTree as ET import calendar import zipfile import os.path import shutil import zlib import os class Xmlorg(object): def __init__(self): self.cwd = os.getcwd() self.year = os.path.basename(self.cwd) def _mkMonthAndDaysDirs(self): ''' creates dirs for every month and day of a of specidifed year. Works for leap years as well. (specified)year/(year)month/day ...2010/201001/01 ...2010/201001/02 ...2010/201001/03 ''' def addZero(n): if len(str(n)) < 2: return '0' + str(n) else: return str(n) dim = [ calendar.monthrange(year,month)[1] for year in \ [int(self.year)] for month in range(1,13) ] count = 1 for n in dim: month = addZero(count) count += 1 ym = os.path.join(self.cwd, self.year + month) os.mkdir(ym) for x in range(1,n+1): x = addZero(x) os.mkdir(os.path.join(ym, x)) def ParseAndOrg(self): '''requires dir and zip struct: .../(year)/(year).zip - example .../2008/2008.zip ''' def movef(fp1,fp2): '''moves files with exception handling''' try: shutil.move(fp1,fp2) except IOError, e: print(e) except WindowsError, e: print(e) self._mkMonthAndDaysDirs() os.mkdir(os.path.join(self.cwd, 'otherFileType')) # dir struct .../(year)/(year).zip - ex. 
.../2008/2008.zip zf = zipfile.ZipFile(os.path.join(self.cwd, self.year + '.zip')) zf.extractall() ld = os.listdir(self.cwd) for i in ld: if os.path.isfile(i) and i.endswith('.xml'): try: tree = ET.parse(i) except: print('%s np' % i) #not parsed root = tree.getroot() if root.findtext('Summary/FileType') == 'Order': date = root.findtext('OrderHeader/OrderDate')[:10] #dd/mm/yyyy dc = date.split('/') fp1 = os.path.join(self.cwd, i) fp2 = os.path.join(self.cwd, dc[2] + dc[1], dc[0]) movef(fp1,fp2) else: fp1 = os.path.join(self.cwd, i) fp2 = os.path.join(self.cwd, 'otherFileType') movef(fp1,fp2) if __name__ == '__main__': os.chdir('c:/sv_zip_test/2010/') #remove xo = Xmlorg() xo.ParseAndOrg() -- http://mail.python.org/mailman/listinfo/python-list