Brian Quinlan wrote: > This is less a Python question and more a optimization/probability > question. Imaging that you have a list of objects and there frequency in > a population e.g. > > lst = [(a, 0.01), (b, 0.05), (c, 0.50), (d, 0.30), (e, 0.04), (f, 0.10)] > > and you want to drawn n items from that list (duplicates allowed), with > that probability distribution. > > The fastest algorithm that I have been able to devise for doing so is: > O(n * log(len(lst))). Can anyone think or a solution with a better time > complexity? If not, is there an obviously better way to do this > (assuming n is big and the list size is small). >
Any way I tried to slice and dice it, I could not get any faster. draw2 and draw 3 generate code on the fly. draw4 sneakily tries to trade memory and accuracy for speed but is even slower! First the times, then the code: $ ./timeit.py 'from probDistribution import draw as draw; draw(10000, [0.01, 0.05, 0.50, 0.30, 0.04, 0.10])' 100 loops, best of 3: 13.4 msec per loop $ ./timeit.py 'from probDistribution import draw2 as draw; draw(10000, [0.01, 0.05, 0.50, 0.30, 0.04, 0.10])' 100 loops, best of 3: 15.2 msec per loop $ ./timeit.py 'from probDistribution import draw3 as draw; draw(10000, [0.01, 0.05, 0.50, 0.30, 0.04, 0.10])' 100 loops, best of 3: 16.2 msec per loop $ ./timeit.py 'from probDistribution import draw4 as draw; draw(10000, [0.01, 0.05, 0.50, 0.30, 0.04, 0.10])' 10 loops, best of 3: 30.5 msec per loop === CODE probDistribution.py === from random import random, randrange from bisect import bisect def draw(n, lst): ps = [] last = 0 for p in lst: ps.append(last + p) last += p # ps = [0.01, 0.06, 0.56, 0.86, 0.90, 1.00] chosen = [0] * len(lst) # track frequency for i in range(n): r = random() chosen[bisect(ps, r)] += 1 # binary search and increment result = [] # rescale probability based on frequency for c in chosen: result.append(float(c) / n) return result def draw2(n, lst): """ uses dynamicc code generation of this form: chosen = [0] * 6 for i in xrange(10000): r = random() if r < 0.01: chosen[0]+=1 elif r < 0.06: chosen[1] +=1 ... elif r < 0.90: chosen[4] +=1 else chosen[5]+=1 """ assert len(lst)>1, "Corner case NOT covered" codestr = 'chosen = [0] * %i\n' % (len(lst),) codestr += 'for i in xrange(%i):\n r = random()\n' % (n,) last = 0.0 lstmax = len(lst)-1 for i,p in enumerate(lst): last += p if i==0: codestr += ' if r < %g: chosen[%i] +=1\n' % (last, i) elif i==lstmax: codestr += ' else: chosen[%i] +=1\n' % (i,) else: codestr += ' elif r < %g: chosen[%i] +=1\n' % (last, i) exec codestr result = [] # rescale probability based on frequency for c in chosen: result.append(float(c) / n) return result def draw3(n, lst): """ uses dynamicc code generation of this form: chosen = [0] * 6 for i in xrange(10000): r = random() chosen[-1+ ( ((r<0.01) and 1) or ((r<0.06) and 2) ... or ((r<0.90) and 5) or 6 )] +=1 """ assert len(lst)>1, "Corner case NOT covered" codestr = 'chosen = [0] * %i\n' % (len(lst),) codestr += 'for i in xrange(%i):\n r = random()\n' % (n,) codestr += ' chosen[-1+ (\n' last = 0.0 lstmax = len(lst)-1 for i,p in enumerate(lst): last += p if i==0: codestr += ' ((r<%g) and 1)\n' % (last) elif i==lstmax: codestr += ' or %i\n )] +=1\n' % (i+1,) else: codestr += ' or ((r<%g) and %i)\n' % (last, i+1) #return codestr exec codestr result = [] # rescale probability based on frequency for c in chosen: result.append(float(c) / n) return result def draw4(n, lst, precision = 0.01, maxbins=10000): """ Memory/speed tradeoff by coarse quantizing of frequency values. """ assert len(lst)>1, "Corner case NOT covered" assert 0.0 < precision < 1.0 and (1.0/precision) < maxbins binmax = int(1.0/precision) chosenbin = [0] * binmax for i in xrange(n): chosenbin[randrange(binmax)] +=1 left, right = 0, 0 # extract bin range for summation chosen = [0] * len(lst) last = 0.0 for i,p in enumerate(lst): last += p right = int(last/precision) chosen[i] = sum( chosenbin[left:right] ) left = right result = [] # rescale probability based on frequency for c in chosen: result.append(float(c) / n) return result === END CODE === -- http://mail.python.org/mailman/listinfo/python-list