On Monday 07 March 2005 14:36, Joerg Schuster wrote:
> Any ideas?

The following program should do the trick (the filenames are hardcoded; see the top of the file):
### shuffle.py
"""Shuffle the lines of a (possibly huge) file without loading it in memory.

The script builds an on-disk index (a ``shelve`` database) that maps the
line number, written as a hexadecimal string, to ``"<offset>:<length>"``
(both hexadecimal, in bytes).  Shuffling the file then only means shuffling
the index entries; the output is produced by walking the shuffled index and
copying the referenced byte ranges.

The code deliberately sticks to constructs that work on both Python 2.7
and Python 3.
"""

import random
import shelve
from contextlib import closing

# Hardcoded file names used when the module is run as a script.
SOURCE_NAME = "test.dat"      # input file whose lines are shuffled
INDEX_NAME = "test.idx"       # external flat-file DB holding the line index
TARGET_NAME = "test.new.dat"  # output file receiving the shuffled lines


def _hexKey(number):
    """Return the index key for a line number: lowercase hex, no prefix."""
    return "%x" % number


def _stripEol(line):
    """Return ``line`` (bytes) without its trailing ``\\r\\n`` or ``\\n``.

    Only the line terminator is removed; any other leading or trailing
    whitespace belongs to the line and is preserved.
    """
    if line.endswith(b"\r\n"):
        return line[:-2]
    if line.endswith(b"\n"):
        return line[:-1]
    return line


def makeIdx(lines, lineindex):
    """Index every line of ``lines`` into ``lineindex`` and return the count.

    Parameters:
        lines: seekable binary file object, read from its current position
            to the end.
        lineindex: mutable mapping (e.g. an open shelf).  For line ``i`` the
            key ``"%x" % i`` is set to ``"<offset>:<length>"``, both in hex.

    Returns:
        The number of lines that were indexed.
    """
    count = 0
    lastpos = lines.tell()
    while lines.readline():
        # tell() is now just past the line terminator (if there is one), so
        # the stored length includes the terminator; writeNewLines() strips
        # it again and writes its own.
        curpos = lines.tell()
        lineindex[_hexKey(count)] = "%x:%x" % (lastpos, curpos - lastpos)
        lastpos = curpos
        count += 1
    return count


def shuffleIdx(lineindex, maxidx, rng=random):
    """Shuffle the index entries ``0 .. maxidx-1`` of ``lineindex`` in place.

    This is a Fisher-Yates shuffle: walking from the last entry down, each
    entry is exchanged with a randomly chosen entry at or before it.  Given
    an ideal random source every permutation is equally likely.  Note that a
    pseudo random number generator has a finite period, so for very long
    files it cannot reach every permutation.

    Parameters:
        lineindex: mutable mapping filled by makeIdx().
        maxidx: number of entries in the index.
        rng: object providing ``randrange(n)``; defaults to the ``random``
            module.  Pass a seeded ``random.Random`` for reproducible runs.

    Indexes with zero or one entry are left untouched.
    """
    upper = maxidx - 1
    # A while loop keeps this working for arbitrarily large counts on
    # Python 2, where xrange() rejects longs.
    while upper > 0:
        pick = rng.randrange(upper + 1)
        if pick != upper:
            upperKey = _hexKey(upper)
            pickKey = _hexKey(pick)
            lineindex[upperKey], lineindex[pickKey] = (
                lineindex[pickKey], lineindex[upperKey])
        upper -= 1


def writeNewLines(lines, lineindex, maxidx, newlines):
    """Write the lines of ``lines`` to ``newlines`` in index order.

    Parameters:
        lines: seekable binary file object that makeIdx() indexed.
        lineindex: mapping holding the (shuffled) index.
        maxidx: number of entries in the index.
        newlines: binary file object opened for writing.

    Every line is written with a single ``\\n`` terminator, including a
    final line that had none in the input.
    """
    number = 0
    while number < maxidx:
        # Extract byte offset and byte length from the index entry.
        offset, length = [
            int(part, 16) for part in lineindex[_hexKey(number)].split(":")]
        lines.seek(offset)
        newlines.write(_stripEol(lines.read(length)) + b"\n")
        number += 1


def shuffleFile(source=SOURCE_NAME, index=INDEX_NAME, target=TARGET_NAME):
    """Shuffle the lines of file ``source`` into file ``target``.

    ``index`` names the shelve database used as scratch space; it is
    recreated empty on every call.  All files are closed again on return,
    also when an error occurs.

    Returns:
        The number of lines written.
    """
    # Binary mode: the index stores byte offsets and byte lengths, which are
    # only reliable for seek()/read() when no newline translation happens.
    with open(source, "rb") as lines, open(target, "wb") as newlines:
        # Flag "n" always starts from a new, empty database so entries of an
        # earlier run on a longer file do not pile up.
        with closing(shelve.open(index, "n")) as lineindex:
            maxidx = makeIdx(lines, lineindex)
            shuffleIdx(lineindex, maxidx)
            writeNewLines(lines, lineindex, maxidx, newlines)
    return maxidx


def main():
    """Script entry point: shuffle the hardcoded input file."""
    shuffleFile()


if __name__ == "__main__":
    main()
### End shuffle.py

# I don't know how fast this program will run, but at least, it does as
# told... ;)
# -- --- Heiko.
pgpeVbq0wufOV.pgp
Description: PGP signature
-- http://mail.python.org/mailman/listinfo/python-list