On Tue, 01 May 2007 05:22:49 -0300, eC <[EMAIL PROTECTED]> wrote:
> On Apr 30, 9:41 am, Steven D'Aprano <[EMAIL PROTECTED]> wrote:
>> On Mon, 30 Apr 2007 00:45:22 -0700, OhKyu Yoon wrote:
>> > I have a really long binary file that I want to read.
>> > The way I am doing it now is:
>> >
>> > for i in xrange(N):  # N is about 10,000,000
>> >     time = struct.unpack('=HHHH', infile.read(8))
>> >     # do something
>> >     tdc = struct.unpack('=LiLiLiLi', self.lmf.read(32))
>>
>> Disk I/O is slow, so don't read from files in tiny little chunks. Read a
>> bunch of records into memory, then process them.
>>
>> # UNTESTED!
>> rsize = 8 + 32  # record size
>> for i in xrange(N//1000):
>>     buffer = infile.read(rsize*1000)  # read 1000 records at once
>>     for j in xrange(1000):  # process each record
>>         offset = j*rsize
>>         time = struct.unpack('=HHHH', buffer[offset:offset+8])
>>         # do something
>>         tdc = struct.unpack('=LiLiLiLi', buffer[offset+8:offset+rsize])
>>         # do something
>>
>> (Now I'm just waiting for somebody to tell me that file.read() already
>> buffers reads...)
>
> I think the file.read() already buffers reads... :)

Now we need someone to actually measure it, to confirm the expected
behavior... Done.

--- begin code ---
import struct, timeit, os

fn = r"c:\temp\delete.me"
fsize = 1000000
if not os.path.isfile(fn):
    f = open(fn, "wb")
    f.write("\0" * fsize)
    f.close()
    os.system("sync")

def smallreads(fn):
    rsize = 40
    N = fsize // rsize
    f = open(fn, "rb")
    for i in xrange(N):
        time = struct.unpack('=HHHH', f.read(8))
        tdc = struct.unpack('=LiLiLiLi', f.read(32))
    f.close()

def bigreads(fn):
    rsize = 40
    N = fsize // rsize
    f = open(fn, "rb")
    for i in xrange(N//1000):
        buffer = f.read(rsize*1000)  # read 1000 records at once
        for j in xrange(1000):       # process each record
            offset = j*rsize
            time = struct.unpack('=HHHH', buffer[offset:offset+8])
            tdc = struct.unpack('=LiLiLiLi', buffer[offset+8:offset+rsize])
    f.close()

print "smallreads", timeit.Timer("smallreads(fn)", "from __main__ import fn,smallreads,fsize").repeat(3,1)
print "bigreads", timeit.Timer("bigreads(fn)", "from __main__ import fn,bigreads,fsize").repeat(3,1)
--- end code ---

Output:
smallreads [4.2534193777646663, 4.126013885559789, 4.2389176672125458]
bigreads [1.2897319939456011, 1.3076018578892405, 1.2703250635695138]

So in this sample case, reading in big chunks is about 3 times faster than
reading many tiny pieces.

--
Gabriel Genellina

--
http://mail.python.org/mailman/listinfo/python-list
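
[A possible follow-up sketch, not part of the original thread: on Python 2.5
and later, precompiling the formats with struct.Struct and reading with
unpack_from() avoids re-parsing the format string and creating a sliced byte
string for every record. The fastreads() name and the chunk size are made up
for illustration; like bigreads() above, it only handles whole multiples of
`chunk` records.]

--- begin code ---
import struct

# Precompiled Struct objects: the format strings are parsed once, not per record.
head = struct.Struct('=HHHH')      # 8-byte timestamp part
body = struct.Struct('=LiLiLiLi')  # 32-byte tdc part
rsize = head.size + body.size      # 40 bytes per record

def fastreads(fn, nrecords, chunk=1000):
    # Hypothetical variant of bigreads(): same chunked reading, but
    # unpack_from() decodes at an offset instead of slicing the buffer.
    f = open(fn, "rb")
    try:
        for i in xrange(nrecords // chunk):
            buffer = f.read(rsize * chunk)   # read `chunk` records at once
            for j in xrange(chunk):
                offset = j * rsize
                time = head.unpack_from(buffer, offset)
                tdc = body.unpack_from(buffer, offset + head.size)
                # do something with time and tdc
    finally:
        f.close()
--- end code ---

[unpack_from(buffer, offset) reads directly at the given position, so no
temporary slices are built per record; whether that gains much on top of the
chunked reads would need the same kind of timeit measurement as above.]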