I would like some feedback on possible solutions to make this script run faster. The system is pegged at 100% CPU and it takes a long time to complete.
#!/usr/bin/env python import gzip import re import os import sys from datetime import datetime import argparse if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-f', dest='inputfile', type=str, help='data file to parse') parser.add_argument('-o', dest='outputdir', type=str, default=os.getcwd(), help='Output directory') args = parser.parse_args() if len(sys.argv[1:]) < 1: parser.print_usage() sys.exit(-1) print(args) if args.inputfile and os.path.exists(args.inputfile): try: with gzip.open(args.inputfile) as datafile: for line in datafile: line = line.replace('mediacdn.xxx.com', 'media.xxx.com') line = line.replace('staticcdn.xxx.co.uk', ' static.xxx.co.uk') line = line.replace('cdn.xxx', 'www.xxx') line = line.replace('cdn.xxx', 'www.xxx') line = line.replace('cdn.xx', 'www.xx') siteurl = line.split()[6].split('/')[2] line = re.sub(r'\bhttps?://%s\b' % siteurl, "", line, 1) (day, month, year, hour, minute, second) = (line.split()[3]).replace('[','').replace(':','/').split('/') datelog = '{} {} {}'.format(month, day, year) dateobj = datetime.strptime(datelog, '%b %d %Y') outfile = '{}{}{}_combined.log'.format(dateobj.year, dateobj.month, dateobj.day) outdir = (args.outputdir + os.sep + siteurl) if not os.path.exists(outdir): os.makedirs(outdir) with open(outdir + os.sep + outfile, 'w+') as outf: outf.write(line) except IOError, err: sys.stderr.write("Error unable to read or extract inputfile: {} {}\n".format(args.inputfile, err)) sys.exit(-1)
-- http://mail.python.org/mailman/listinfo/python-list