Re: nonuniform sampling with replacement

Alf P. Steinbach Sun, 21 Mar 2010 05:08:04 -0700

* Jah_Alarm:

I've got a vector length n of integers (some of them are repeating),
and I got a selection probability vector of the same length. How will
I sample with replacement k (<=n) values with the probabilty vector.
In Matlab this function is randsample. I couldn't find anything to
this extent in Scipy or Numpy.


<code>
#Py3
import operator     # itemgetter
import random
from collections import defaultdict

def normalized_to_sum( s, v ):
    current_s = sum( v )
    c = s/current_s
    return [c*x for x in v]

class ValueSampler:
    def __init__( self, values, probabilities ):
        assert len( values ) == len( probabilities )
        get2nd = operator.itemgetter( 1 )
        v_p = sorted( zip( values, probabilities ), key = get2nd, reverse = 
True )
        v_ap = [];  sum = 0;
        for (v, p) in v_p:
            v_ap.append( (v, p + sum) );
            sum += p
        self._data = v_ap

    def __call__( self, p ):
        return self.choice( p )

    def choice( self, p ):
        data = self._data; i_v = 0; i_p = 1;
        assert 0 <= p <= 1
        assert len( data ) > 0, "Sampler: Sampling from empty value set"
        low = 0; high = len( data ) - 1;
        if p > data[high][i_p]: return data[high][i_p]  # Float values 
workaround.
        while low != high:
            mid = (low + high)//2
            if p > data[mid][i_p]:
                low = mid + 1
            else:
                high = mid
        return data[low][i_v]


def main():
    v = [3, 1, 4, 1, 5, 9, 2, 6, 5, 4];
    p = normalized_to_sum( 1, [2, 7, 1, 8, 2, 8, 1, 8, 2, 8] )
    sampler = ValueSampler( v, p )

    probabilities = defaultdict( lambda: 0.0 )
    for (i, value) in enumerate( v ):
        probabilities[value] += p[i]
    print( probabilities )
    print()

    frequencies = defaultdict( lambda: 0.0 )
    n = 100000
    for i in range( n ):
        value = sampler( random.random() )
        frequencies[value] += 1/n
    print( frequencies )

main()
</code>


Cheers & hth.,

- Alf

Disclaimer: I just cooked it up and just cooked up binary searches usually havebugs. They usually need to be exercised and fixed. But I think you get the idea.Note also that division differs in Py3 and Py2. This is coded for Py3.

--
http://mail.python.org/mailman/listinfo/python-list

Re: nonuniform sampling with replacement

Reply via email to