Changeset: f9d71464a676 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f9d71464a676
Modified Files:
        gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

fix reservoir sampling semantic bug


diffs (122 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -75,9 +75,9 @@
                keys[p2] = key;         \
        } while (0)
 
-#define compKeysGT(p1, p2)                                                     
                                                \
+#define compKeysGT(p1, p2)                                                     
\
        ((keys[p1] > keys[p2]) ||                                               
\
-       (keys[p1] == keys[p2] && oids[p1] > oids[p2]))
+       ((keys[p1] == keys[p2]) && (oids[p1] > oids[p2])))
 
 /* this is a straightforward implementation of a binary tree */
 struct oidtreenode {
@@ -243,12 +243,12 @@ BATsample(BAT *b, BUN n)
 /* based on Alg-A-exp from 'Weighted random sampling with a reservoir' by 
Efraimidis and Spirakis (2006) */
 BAT *
 BATweightedsample(BAT *b, BUN n, BAT *w)
-{
+{//TODO: does not create correct samples yet (so it seems! or is the output 
broken?)
        BAT* sample;
        oid* oids;//points to the oids in sample
        dbl* w_ptr;//TODO types of w
        dbl* keys;//keys as defined in Alg-A-exp
-       BUN cnt, i;
+       BUN cnt, i, j;
        mtwist *mt_rng;
        BUN pos, childpos;
        oid item;
@@ -266,7 +266,7 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
        cnt = BATcount(b);
 
-       keys = (double*) malloc(sizeof(double)*n);
+       keys = (dbl*) malloc(sizeof(dbl)*n);
        if(keys == NULL)
                return NULL;
 
@@ -276,8 +276,10 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
                return NULL;
        }
 
-       oids = (oid *)sample->theap.base;
-       w_ptr = (dbl*) w->theap.base;
+
+       //oids = (oid *)sample->theap.base;
+       oids = (oid *) Tloc(sample, 0);
+       w_ptr = (dbl*) Tloc(w, 0);//TODO is this the right way to get w_ptr?
 
        mt_rng = mtwist_new();
        mtwist_seed(mt_rng, rand());
@@ -285,30 +287,56 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
        BATsetcount(sample, n);
                /* obtain sample */
        //TODO: reservoir sampling with exponential jumps
-       for(i=0; i<n; i++) {
-               oids[i] = i+minoid;
-               keys[i] = pow(mtwist_drand(mt_rng),1.0/w_ptr[i]);//TODO cast 
1.0 to dbl?
+       //Initialize prioqueue
+       i=0;//i indices the initial sample (filled with elements with non-zero 
weight)
+               //j indices the oids and weights
+       for(j=0; i < n && j < cnt; j++) {
+               if(w_ptr[j] == 0.0)
+                       continue;
+               oids[i] = (oid)(j+minoid);
+               keys[i] = pow(mtwist_drand(mt_rng),1.0/w_ptr[j]);//TODO cast 
1.0 to dbl? NULL values in w_ptr
+               i++;
        }
+       if(i < n)//not enough non-zero weights
+               return NULL;
        heapify(compKeysGT, SWAP3);//NOTE: writes to 'i'
 
-       i=n;
+       fprintf(stderr,"\n\n ID | KEY   (INITIAL HEAP)\n");
+       fprintf(stderr,"----+----\n");
+       for(i=0; i<n; i++)
+               fprintf(stderr," %2d | %.2f \n", (int)(oids[i]-minoid), 
keys[i]);
+
        while(true) {
                r = mtwist_drand(mt_rng);
-               xw = log(r)/log(keys[oids[0]-minoid]);
-               while(xw > 0 && i < cnt) {
-                       xw -= w_ptr[i];
-                       i++;
+               xw = log(r)/log(keys[0]);
+               fprintf(stderr,"THIS SHOULD BE POSITIVE: xw = log(r = 
%f)/log(keys[0] = %f) = %f\n",r,keys[0],xw);
+               for(;j<cnt && xw > w_ptr[j]; j++) {
+                       xw -= w_ptr[j];
+                       fprintf(stderr,"j = %d, xw = %f (just subtracted 
%f)\n", (int)j, xw, w_ptr[j]);
                }
-               if(i >= cnt) break;
-               //At this point, xw - (w_ptr[c]+w_ptr[c+1]+...+w_ptr[i]) <= 0
-               tw = pow(keys[oids[0]-minoid], w_ptr[i]);
+               if(j >= cnt) break;
+               //At this point, w_ptr[c]+w_ptr[c+1]+...+w_ptr[i-1] < xw < 
w_ptr[c]+w_ptr[c+1]+...+w_ptr[i]
+               tw = pow(keys[0], w_ptr[j]);
                r2 = mtwist_drand(mt_rng)*(1-tw)+tw;
-               key = pow(r2, 1/w_ptr[i]);
+               key = pow(r2, 1/w_ptr[j]);
 
-               oids[0] = i+minoid;
+               //Replace element with lowest key in prioqueue
+               oids[0] = (oid)(j+minoid);
                keys[0] = key;
                siftup(compKeysGT, 0, SWAP3);//NOTE: writes to 'key'
+
+               fprintf(stderr,"\n\n ID | KEY   (<%d,%.2f> INSERTED)\n", 
(int)j, key);
+               fprintf(stderr,"----+----\n");
+               for(i=0; i<n; i++)
+                       fprintf(stderr," %2d | %.2f \n", (int)(oids[i]-minoid), 
keys[i]);
+
+               //We have added the jth element, can continue
+               j++;
        }
+       fprintf(stderr,"\n\n ID | KEY   (FINAL RESULT)\n");
+       fprintf(stderr,"----+----\n");
+       for(i=0; i<n; i++)
+               fprintf(stderr," %2d | %.2f \n", (int)(oids[i]-minoid), 
keys[i]);
 
        free(keys);
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to