I have a simple program to read a text (.csv) file and split it into several smaller files. Tonight I decided to write a Unicode-aware variant and was surprised at the difference in performance. Is there a better way?
> from __future__ import with_statement
> import codecs
>
> def _rowreader(filename, separator='\t'):
>     """Generator for iteration over potentially large file."""
>     with codecs.open(filename, 'rU', 'utf-8', 'backslashreplace') as tabfile:
>
>         for row in tabfile:
>             yield [v.strip() for v in row.split(separator)]
>
> def generator_of_output(source_of_lines):
>     for line in source_of_lines:
>         for result in some_function(line):
>             yield result
>
> def coroutine(outfile_prefix, outfile_suffix, sep='\t'):
>     outfile = '%s_%s.txt'% (outfile_prefix, outfile_suffix)
>     with codecs.open(outfile, 'w', 'utf-8') as out_part:
>         while True:
>             line = (yield)
>             out_part.write(sep.join(line) + '\n')
>
> def _file_to_files(infile, outfile_prefix, column, sep):
>     column_values = dict()
>     for line in _rowreader(infile, sep):
>         outfile_suffix = line[column].strip('\'\"')
>         if outfile_suffix in column_values:
>             column_values[outfile_suffix].send(line)
>         else:
>             file_writer = coroutine(outfile_prefix, outfile_suffix, sep)
>             file_writer.next()
>             file_writer.send(line)
>             column_values[outfile_suffix] = file_writer
>     for file_writer in column_values.itervalues():
>         file_writer.close()

the plain version is the same except for

>     with open(filename, 'rU') as tabfile:
>     with open(outfile, 'wt') as out_part:

The difference:

> "uid","timestamp","taskid","inputid","value"
> "15473178739336026589","2010-02-18T20:50:15+0000","11696870405","73827093507","83523277829"
> "15473178739336026589","2010-02-18T20:50:15+0000","11696870405","11800677379","12192844803"
> "15473178739336026589","2010-02-18T20:50:15+0000","11696870405","31231839235","52725552133"
>
> sys...@bembo:~/UCLC/bbc/wb2$ wc -l wb.csv
> 9293271 wb.csv
>
> normal version
> sys...@bembo:~/UCLC$ time ~/UCL/toolkit/file_splitter.py -o tt --separator
> comma -k 2 wb.csv
>
> real 0m43.714s
> user 0m37.370s
> sys 0m2.732s
>
> unicode version
> sys...@bembo:~/UCLC$ time ./file_splitter.py -o t --separator comma -k 2
> wb.csv
>
> real 4m8.695s
> user 3m19.236s
> sys 0m39.262s

--
David Clark, MSc, PhD.
UCL Centre for Publishing
Gower Str London WC1E 6BT
What sort of web animal are you?
<https://www.bbc.co.uk/labuk/experiments/webbehaviour>
--
http://mail.python.org/mailman/listinfo/python-list