psaff...@googlemail.com wrote:
Thanks for your replies. Many apologies for not including the right information first time around. More information is below....
Here is another way to try (untested):
import numpy
import time

chrommap = dict(chrY='y', chrX='x', chr13='c', chr12='b', chr11='a',
                chr10='0', chr17='g', chr16='f', chr15='e', chr14='d',
                chr19='i', chr18='h', chrM='m', chr22='l', chr20='j',
                chr21='k', chr7='7', chr6='6', chr5='5', chr4='4',
                chr3='3', chr2='2', chr1='1', chr9='9', chr8='8')

def consume_file(file_name, chunks):
    lx = []   # one-character chromosome codes
    cx = []   # coordinates
    px = []   # points
    block = []
    with open(file_name) as fh:
        for row in fh:
            chrom, coord, point = row.split()
            lx.append(chrommap[chrom])
            cx.append(coord)
            px.append(point)
            if len(cx) >= chunks:
                block.append(''.join(lx))
                block.append(numpy.array(cx, dtype=int))
                block.append(numpy.array(px, dtype=float))
                lx = []
                cx = []
                px = []
    if lx:
        block.append(''.join(lx))
        block.append(numpy.array(cx, dtype=int))
        block.append(numpy.array(px, dtype=float))
    return (''.join(block[0::3]),
            numpy.concatenate(block[1::3]),
            numpy.concatenate(block[2::3]))

# The following repeats 128, to avoid initial read issues.
# Treat the diff twixt the two 128s as read overhead.
for CHUNKS in 128, 128, 256, 1024, 4096, 16384:
    t0 = time.clock()
    letters, coords, points = consume_file("largefile.txt", CHUNKS)
    t1 = time.clock()
    print "finished %s in %s chunks: %.2f" % (
            len(letters), CHUNKS, t1 - t0)

--Scott David Daniels
scott.dani...@acm.org
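
For anyone curious how the returned (letters, coords, points) triple might
then be used, here is a small hypothetical sketch. It assumes the
consume_file/chrommap definitions above have been run; the chr7 lookup,
the file name and the mean are only placeholders, not from the original
post:

import numpy

letters, coords, points = consume_file("largefile.txt", 1024)

# letters holds one character per input row; turn it into an array of
# single-character strings so it can be compared element-wise.
chars = numpy.array(list(letters))
mask = chars == chrommap['chr7']      # True where the row was on chr7

chr7_coords = coords[mask]
chr7_points = points[mask]
print "chr7 rows: %d, mean point: %.3f" % (len(chr7_points),
                                           chr7_points.mean())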