Changeset: 680709d968ab for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=680709d968ab Modified Files: gdk/gdk_bloom.c gdk/gdk_join.c gdk/gdk_private.h Branch: leftmart Log Message:
build a bloom filter on the right side and use it during HASHJOIN diffs (245 lines): diff --git a/gdk/gdk_bloom.c b/gdk/gdk_bloom.c --- a/gdk/gdk_bloom.c +++ b/gdk/gdk_bloom.c @@ -65,21 +65,40 @@ BLOOMsize(BUN cnt) { return m; } +static void bloom_stats(BAT b, Bloomfilter *bloom) { + BUN on = 0, off=0; + size_t i, j; + unsigned char *filter = (unsigned char *) bloom->filter->base; + + + for (i=0; i < bloom->bytes; i++) + for (j=0;j<8;j++) + if (filter[i] & (1 << j)) + on++; + else + off++; + fprintf(stderr, "#BATbloom(b=%s#" BUNFMT ") %s: bits on = " BUNFMT ", bits off = " BUNFMT "\n", + BATgetId(b), BATcount(b), b->T->heap.filename on, off); +} + #define BLOOM_BUILD(TYPE) \ do { \ const TYPE *restrict col = (TYPE *) Tloc(b, b->batFirst); \ BUN p; \ - BUN key,hv,x,y,z; \ + BUN key,mv,hv,x,y,z; \ for (p=0; p<cnt; p++) { \ key = (BUN) col[p]; \ hash_init(key, x,y,z); \ next_hash(hv, x,y,z); \ - filter[quotient8(hv)] |= (1 << remainder8(hv)); \ + mv = modulor(hv,bloom->mask); \ + filter[quotient8(mv)] |= (1 << remainder8(mv)); \ next_hash(hv, x,y,z); \ - filter[quotient8(hv)] |= (1 << remainder8(hv)); \ + mv = modulor(hv,bloom->mask); \ + filter[quotient8(mv)] |= (1 << remainder8(mv)); \ if (bloom->kfunc == 3) { \ next_hash(hv, x,y,z); \ - filter[quotient8(hv)] |= (1 << remainder8(hv)); \ + mv = modulor(hv,bloom->mask); \ + filter[quotient8(mv)] |= (1 << remainder8(mv)); \ } \ } \ } while (0) @@ -105,7 +124,7 @@ BATbloom(BAT *b) break; default: /* type not supported */ /* doesn't look enough like base type: do nothing */ - GDKerror("BATbloom: unsupported type\n"); + /* GDKerror("BATbloom: unsupported type\n"); */ return GDK_FAIL; } @@ -120,9 +139,9 @@ BATbloom(BAT *b) if (b->T->bloom == NULL) { BUN cnt; - size_t bytes; Bloomfilter *bloom; - char *filter; + unsigned char *filter; + size_t i; bloom = (Bloomfilter *) GDKzalloc(sizeof(Bloomfilter)); if (bloom == NULL) { @@ -144,16 +163,16 @@ BATbloom(BAT *b) (ATOMbasetype(b->T->type) == TYPE_sht && (bloom->mbits = (1 << 16))) ) { bloom->kfunc = 1; bloom->mask = bloom->mbits-1; - bytes = quotient8(bloom->mbits); + bloom->bytes = quotient8(bloom->mbits); } else { bloom->mbits = BLOOMsize(cnt); /* 2 functions if the ratio is close to 3, 3 otherwise */ bloom->kfunc = bloom->mbits/cnt == 3 ? 2 : 3; bloom->mask = bloom->mbits-1; - bytes = quotient8(bloom->mbits) + 1; + bloom->bytes = quotient8(bloom->mbits) + 1; } - if (HEAPalloc(bloom->filter, bytes, 1) != GDK_SUCCEED) { + if (HEAPalloc(bloom->filter, bloom->bytes, 1) != GDK_SUCCEED) { GDKerror("#BATbloom: memory allocation error"); GDKfree(bloom->filter); GDKfree(bloom); @@ -163,13 +182,16 @@ BATbloom(BAT *b) ALGODEBUG fprintf(stderr, "#BATbloom(b=%s#" BUNFMT ") %s: " "create bloom filter: mbits = " BUNFMT ", ratio = " BUNFMT - "kfunc = %d, bytes = " BUNFMT "\n", BATgetId(b), + ", kfunc = %d, bytes = " BUNFMT "\n", BATgetId(b), BATcount(b), b->T->heap.filename, bloom->mbits, bloom->mbits/BATcount(b), - bloom->kfunc, quotient8(bloom->mbits)); + bloom->kfunc, bloom->bytes); filter = (unsigned char *) bloom->filter->base; + for (i=0; i < bloom->bytes; i++) + filter[i] = 0; + switch (ATOMbasetype(b->T->type)) { case TYPE_bte: { @@ -209,19 +231,44 @@ BATbloom(BAT *b) t1 = GDKusec(); - ALGODEBUG fprintf(stderr, "#BATbloom: bloom construction " LLFMT " usec\n", t1 - t0); - //print_stats(b, t1-t0); - MT_lock_unset(&GDKhashLock(abs(b->batCacheid))); - assert(b->batCapacity >= BATcount(b)); + ALGODEBUG fprintf(stderr, "#BATbloom: bloom construction " LLFMT " usec\n", t1 - t0); + ALGODEBUG bloom_stats(b, b->T->bloom); + return GDK_SUCCEED; } +inline +int BLOOMask(BUN v, Bloomfilter *bloom) +{ + BUN hv,mv,x,y,z; + int ret = 0; + const unsigned char *filter = (unsigned char *) bloom->filter->base; + if (bloom->kfunc == 1) { + return filter[quotient8(v)] & (1 << remainder8(v)); + } + + hash_init(v, x,y,z); + next_hash(hv, x,y,z); + mv = modulor(hv, bloom->mask); + ret = filter[quotient8(mv)] & (1 << remainder8(mv)); + if (ret) { + next_hash(hv, x,y,z); + mv = modulor(hv, bloom->mask); + ret = (filter[quotient8(mv)] & (1 << remainder8(mv))); + if (bloom->kfunc == 3 && ret) { + next_hash(hv, x,y,z); + mv = modulor(hv, bloom->mask); + ret = (filter[quotient8(mv)] & (1 << remainder8(mv))); + } + } + + return ret; +} #if 0 - static void BLOOMremove(BAT *b) { diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -2606,13 +2606,18 @@ binsearchcand(const oid *cand, BUN lo, B #define HASHJOIN(TYPE, WIDTH) \ do { \ BUN hashnil = HASHnil(hsh); \ + BUN yes = 0, no = 0, false_positive = 0; \ for (lo = lstart - BUNfirst(l) + l->hseqbase; \ lstart < lend; \ lo++) { \ + int ask; \ v = FVALUE(l, lstart); \ lstart++; \ nr = 0; \ - if (*(const TYPE*)v != TYPE##_nil) { \ + ask = BLOOMask((BUN) (*(TYPE*)v), r->T->bloom); \ + no++; \ + if (*(const TYPE*)v != TYPE##_nil && ask) { \ + yes++; no--; \ for (rb = HASHget##WIDTH(hsh, hash_##TYPE(hsh, v)); \ rb != hashnil; \ rb = HASHgetlink##WIDTH(hsh, rb)) \ @@ -2623,6 +2628,7 @@ binsearchcand(const oid *cand, BUN lo, B } \ } \ if (nr == 0) { \ + if (ask) false_positive++; \ lskipped = BATcount(r1) > 0; \ } else { \ if (lskipped) { \ @@ -2636,6 +2642,11 @@ binsearchcand(const oid *cand, BUN lo, B r1->trevsorted = 0; \ } \ } \ + fprintf(stderr,"#BATbloom(b=%s#" BUNFMT ") %s: " \ + "ask bloom filter: yes = " BUNFMT ", no = " BUNFMT \ + ", probes = " BUNFMT ", false positives = " BUNFMT "\n", \ + BATgetId(r), BATcount(r), r->T->heap.filename, \ + yes, no, lo, false_positive ); \ } while (0) static gdk_return @@ -2767,6 +2778,11 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * hsh = r->T->hash; t = ATOMbasetype(r->ttype); + /* check for bloom filter on right */ + if (!BATcheckbloom(r)) { + BATbloom(r); + } + if (lcand == NULL && rcand == NULL && lvars == NULL && !nil_matches && !nil_on_miss && !semi && !only_misses && l->ttype != TYPE_void && (t == TYPE_int || t == TYPE_lng)) { @@ -3974,8 +3990,8 @@ BATjoin(BAT **r1p, BAT **r2p, BAT *l, BA /* both sorted */ return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0, maxsize, t0, 0); } else if (lhash && rhash) { - /* both have hash, smallest on right */ /* TODO: swap = lcount > rcount (small side left)*/ - swap = lcount < rcount; + /* both have hash, smallest on left (TODO) */ + swap = lcount > rcount; reason = "both have hash"; } else if (lhash) { /* only left has hash, swap */ diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -63,6 +63,9 @@ struct BATstore { __attribute__((__visibility__("hidden"))); __hidden int BATcheckbloom(BAT *b) __attribute__((__visibility__("hidden"))); +__hidden int BLOOMask(BUN v, Bloomfilter *bloom) + __attribute__((__visibility__("hidden"))); + __hidden BATstore *BATcreatedesc(int tt, int heapnames, int role) __attribute__((__visibility__("hidden"))); __hidden void BATdelete(BAT *b) @@ -248,6 +251,7 @@ struct Bloomfilter { BUN mbits; int kfunc; lng mask; + size_t bytes; Heap *filter; }; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list