psaff...@googlemail.com wrote:
Thanks for your replies. Many apologies for not including the right
information first time around. More information is below....
Here is another way to try (untested):

import numpy
import time

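# Map each chromosome name to a single-character code, so the names
# for a whole block can be packed into one string.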
chrommap = dict(chrY='y', chrX='x', chr13='c', chr12='b', chr11='a',
                chr10='0', chr17='g', chr16='f', chr15='e', chr14='d',
                chr19='i', chr18='h', chrM='m', chr22='l', chr20='j',
                chr21='k', chr7='7', chr6='6', chr5='5', chr4='4',
                chr3='3', chr2='2', chr1='1', chr9='9', chr8='8')

def consume_file(file_name, chunks):
    lx = []
    cx = []
    px = []
    block = []
    with open(file_name) as fh:
        for row in fh:
            chrom, coord, point = row.split()
            lx.append(chrommap[chrom])
            cx.append(coord)
            px.append(point)
            if len(cx) >= chunks:
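                # Full chunk: convert and stash it, then start fresh lists.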
                block.append(''.join(lx))
                block.append(numpy.array(cx, dtype=int))
                block.append(numpy.array(px, dtype=float))
                lx = []
                cx = []
                px = []
        if lx:
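            # Flush any leftover rows after the final full chunk.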
            block.append(''.join(lx))
            block.append(numpy.array(cx, dtype=int))
            block.append(numpy.array(px, dtype=float))

    return (''.join(block[0::3]),
            numpy.concatenate(block[1::3]),
            numpy.concatenate(block[2::3]))


# The following repeats 128, to avoid initial read issues.
# Treat the diff twixt the two 128s as read overhead.
for CHUNKS in 128, 128, 256, 1024, 4096, 16384:
    t0 = time.clock()
    letters, coords, points = consume_file("largefile.txt", CHUNKS)
    t1 = time.clock()
    print "finished %s in %s chunks: %f.2" % (
         len(letters), CHUNKS, t1 - t0)
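
If it helps, here is a (likewise untested) sketch of how the three
parallel results line up once consume_file returns -- record i is
simply letters[i], coords[i], points[i]:

letters, coords, points = consume_file("largefile.txt", 1024)
assert len(letters) == len(coords) == len(points)
# Spot-check the first, second, and last records.
for i in 0, 1, len(letters) - 1:
    print "record %d: chrom code %s, coord %d, point %f" % (
        i, letters[i], coords[i], points[i])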


--Scott David Daniels
scott.dani...@acm.org