Changeset: 354674570bd5 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=354674570bd5 Modified Files: gdk/gdk.h gdk/gdk_sample.c Branch: default Log Message:
BATsample_ implements BATsample but expecting void head and produce oid's of sampled BUNs.

diffs (109 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -3165,6 +3165,7 @@ gdk_export BAT *BATintersectcand(BAT *a,
  *
  */
 gdk_export BAT *BATsample(BAT *b, BUN n);
+gdk_export BAT *BATsample_(BAT *b, BUN n); /* version that expects void head and returns oids */
 
 /* generic n-ary multijoin beast, with defines to interpret retval */
 #define MULTIJOIN_SORTED(r)	((char*) &r)[0]
diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -114,3 +114,94 @@ bunins_failed:
 	BBPreclaim(bn);
 	return NULL;
 }
+
+/* BATsample_ implements sampling for void headed BATs.
+ *
+ * b must have a dense (void) head.  The result has a dense head starting
+ * at 0 and a tail holding the oids of n distinct BUNs of b in ascending
+ * order.  When n is 0 the result is empty; when n is at least the count of
+ * b, all oids of b are returned as a dense tail.  NULL is returned when the
+ * result BAT cannot be allocated. */
+BAT *
+BATsample_(BAT *b, BUN n)
+{
+	BAT *bn;
+	BUN cnt;
+
+	BATcheck(b, "BATsample");
+	assert(BAThdense(b));
+	ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n");
+	ALGODEBUG fprintf(stderr, "#BATsample: sample " BUNFMT " elements.\n", n);
+
+	cnt = BATcount(b);
+	/* empty sample size */
+	if (n == 0) {
+		bn = BATnew(TYPE_void, TYPE_void, 0);
+		if (bn == NULL) {
+			GDKerror("#BATsample: memory allocation error");
+			return NULL;
+		}
+		BATsetcount(bn, 0);
+		BATseqbase(bn, 0);
+		BATseqbase(BATmirror(bn), 0);
+	/* sample size is larger than the input BAT, return all oids */
+	} else if (cnt <= n) {
+		bn = BATnew(TYPE_void, TYPE_void, cnt);
+		if (bn == NULL) {
+			GDKerror("#BATsample: memory allocation error");
+			return NULL;
+		}
+		BATsetcount(bn, cnt);
+		BATseqbase(bn, 0);
+		BATseqbase(BATmirror(bn), b->H->seq);
+	} else {
+		BUN smp = 0;
+		/* Skip probability is top/cnt, with top the number of BUNs that
+		 * may still be skipped; it must not include the head seqbase. */
+		wrd top = (wrd) (cnt - n);
+		/* we use wrd and not BUN since p may be -1 */
+		wrd p = ((wrd) b->hseqbase) - 1;
+		oid *o;
+
+		/* The tail is filled through a raw pointer below, so room for
+		 * all n oids has to be requested here. */
+		bn = BATnew(TYPE_void, TYPE_oid, n);
+		if (bn == NULL) {
+			GDKerror("#BATsample: memory allocation error");
+			return NULL;
+		}
+		o = (oid *) Tloc(bn, BUNfirst(bn));
+		while (smp < n-1) { /* loop until all but 1 values are sampled */
+			double v = DRAND;
+			double quot = (double)top/(double)cnt;
+			BUN jump = 0;
+			while (quot > v) { /* determine how many positions to jump */
+				jump++;
+				top--;
+				cnt--;
+				quot *= (double)top/(double)cnt;
+			}
+			p += (jump+1);
+			cnt--;
+			o[smp++] = (oid) p;
+		}
+		/* 1 left: take it from the cnt BUNs that follow p; the +1 keeps
+		 * it distinct from the previous sample and inside b */
+		p += 1 + (wrd) ((BUN) rand() % cnt);
+		o[smp] = (oid) p;
+
+		/* property management */
+		BATsetcount(bn, n);
+		bn->trevsorted = bn->U->count <= 1;
+		bn->tkey = 1;
+		bn->tdense = bn->U->count <= 1;
+		if (bn->U->count == 1)
+			bn->tseqbase = * (oid *) Tloc(bn, BUNfirst(bn));
+		bn->hdense = 1;
+		bn->hseqbase = 0;
+		bn->hkey = 1;
+		bn->hrevsorted = bn->U->count <= 1;
+	}
+
+	return bn;
+}
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list