Changeset: 680709d968ab for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=680709d968ab
Modified Files:
        gdk/gdk_bloom.c
        gdk/gdk_join.c
        gdk/gdk_private.h
Branch: leftmart
Log Message:

build a bloom filter on the right side and use it during HASHJOIN


diffs (245 lines):

diff --git a/gdk/gdk_bloom.c b/gdk/gdk_bloom.c
--- a/gdk/gdk_bloom.c
+++ b/gdk/gdk_bloom.c
@@ -65,21 +65,40 @@ BLOOMsize(BUN cnt) {
        return m;
 }
 
+static void bloom_stats(BAT b, Bloomfilter *bloom) {
+       BUN on = 0, off=0;
+       size_t i, j;
+       unsigned char *filter = (unsigned char *) bloom->filter->base;
+
+
+       for (i=0; i < bloom->bytes; i++)
+               for (j=0;j<8;j++)
+                       if (filter[i] & (1 << j))
+                               on++;
+                       else
+                               off++;
+       fprintf(stderr, "#BATbloom(b=%s#" BUNFMT ") %s: bits on = " BUNFMT ", 
bits off = " BUNFMT "\n",
+               BATgetId(b), BATcount(b), b->T->heap.filename on, off);
+}
+
 #define BLOOM_BUILD(TYPE)                                                      
\
 do {                                                                           
\
        const TYPE *restrict col = (TYPE *) Tloc(b, b->batFirst);               
\
        BUN p;                                                                  
\
-       BUN key,hv,x,y,z;                                                       
\
+       BUN key,mv,hv,x,y,z;                                                    
\
        for (p=0; p<cnt; p++) {                                                 
\
                key = (BUN) col[p];                                             
\
                hash_init(key, x,y,z);                                          
\
                next_hash(hv, x,y,z);                                           
\
-               filter[quotient8(hv)] |= (1 << remainder8(hv));                 
\
+               mv = modulor(hv,bloom->mask);                                   
\
+               filter[quotient8(mv)] |= (1 << remainder8(mv));                 
\
                next_hash(hv, x,y,z);                                           
\
-               filter[quotient8(hv)] |= (1 << remainder8(hv));                 
\
+               mv = modulor(hv,bloom->mask);                                   
\
+               filter[quotient8(mv)] |= (1 << remainder8(mv));                 
\
                if (bloom->kfunc == 3) {                                        
\
                        next_hash(hv, x,y,z);                                   
\
-                       filter[quotient8(hv)] |= (1 << remainder8(hv));         
\
+                       mv = modulor(hv,bloom->mask);                           
\
+                       filter[quotient8(mv)] |= (1 << remainder8(mv));         
\
                }                                                               
\
        }                                                                       
\
 } while (0)
@@ -105,7 +124,7 @@ BATbloom(BAT *b)
                break;
        default:                /* type not supported */
                /* doesn't look enough like base type: do nothing */
-               GDKerror("BATbloom: unsupported type\n");
+               /* GDKerror("BATbloom: unsupported type\n"); */
                return GDK_FAIL;
        }
 
@@ -120,9 +139,9 @@ BATbloom(BAT *b)
 
        if (b->T->bloom == NULL) {
                BUN cnt;
-               size_t bytes;
                Bloomfilter *bloom;
-               char *filter;
+               unsigned char *filter;
+               size_t i;
 
                bloom = (Bloomfilter *) GDKzalloc(sizeof(Bloomfilter));
                if (bloom == NULL) {
@@ -144,16 +163,16 @@ BATbloom(BAT *b)
                     (ATOMbasetype(b->T->type) == TYPE_sht && (bloom->mbits = 
(1 << 16))) ) {
                        bloom->kfunc = 1;
                        bloom->mask = bloom->mbits-1;
-                       bytes = quotient8(bloom->mbits);
+                       bloom->bytes = quotient8(bloom->mbits);
                } else {
                        bloom->mbits = BLOOMsize(cnt);
                        /* 2 functions if the ratio is close to 3, 3 otherwise 
*/
                        bloom->kfunc = bloom->mbits/cnt == 3 ? 2 : 3;
                        bloom->mask = bloom->mbits-1;
-                       bytes = quotient8(bloom->mbits) + 1;
+                       bloom->bytes = quotient8(bloom->mbits) + 1;
                }
 
-               if (HEAPalloc(bloom->filter, bytes, 1) != GDK_SUCCEED) {
+               if (HEAPalloc(bloom->filter, bloom->bytes, 1) != GDK_SUCCEED) {
                        GDKerror("#BATbloom: memory allocation error");
                        GDKfree(bloom->filter);
                        GDKfree(bloom);
@@ -163,13 +182,16 @@ BATbloom(BAT *b)
 
                ALGODEBUG fprintf(stderr, "#BATbloom(b=%s#" BUNFMT ") %s: "
                                "create bloom filter: mbits = " BUNFMT ", ratio 
= " BUNFMT
-                               "kfunc = %d, bytes = " BUNFMT "\n", BATgetId(b),
+                               ", kfunc = %d, bytes = " BUNFMT "\n", 
BATgetId(b),
                                BATcount(b), b->T->heap.filename,
                                bloom->mbits,  bloom->mbits/BATcount(b),
-                               bloom->kfunc, quotient8(bloom->mbits));
+                               bloom->kfunc, bloom->bytes);
 
                filter = (unsigned char *) bloom->filter->base;
 
+               for (i=0; i < bloom->bytes; i++)
+                       filter[i] = 0;
+
                switch (ATOMbasetype(b->T->type)) {
                case TYPE_bte:
                {
@@ -209,19 +231,44 @@ BATbloom(BAT *b)
 
        t1 = GDKusec();
 
-       ALGODEBUG fprintf(stderr, "#BATbloom: bloom construction " LLFMT " 
usec\n", t1 - t0);
-       //print_stats(b, t1-t0);
-
        MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
 
-       assert(b->batCapacity >= BATcount(b));
+       ALGODEBUG fprintf(stderr, "#BATbloom: bloom construction " LLFMT " 
usec\n", t1 - t0);
+       ALGODEBUG bloom_stats(b, b->T->bloom);
+
        return GDK_SUCCEED;
 }
 
+inline
+int BLOOMask(BUN v, Bloomfilter *bloom)
+{
+       BUN hv,mv,x,y,z;
+       int ret = 0;
 
+       const unsigned char *filter = (unsigned char *) bloom->filter->base;
+       if (bloom->kfunc == 1) {
+               return filter[quotient8(v)] & (1 << remainder8(v));
+       }
+
+       hash_init(v, x,y,z);
+       next_hash(hv, x,y,z);
+       mv = modulor(hv, bloom->mask);
+       ret = filter[quotient8(mv)] & (1 << remainder8(mv));
+       if (ret) {
+               next_hash(hv, x,y,z);
+               mv = modulor(hv, bloom->mask);
+               ret = (filter[quotient8(mv)] & (1 << remainder8(mv)));
+               if (bloom->kfunc == 3 && ret) {
+                       next_hash(hv, x,y,z);
+                       mv = modulor(hv, bloom->mask);
+                       ret = (filter[quotient8(mv)] & (1 << remainder8(mv)));
+               }
+       }
+
+       return ret;
+}
 
 #if 0
-
 static void
 BLOOMremove(BAT *b)
 {
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -2606,13 +2606,18 @@ binsearchcand(const oid *cand, BUN lo, B
 #define HASHJOIN(TYPE, WIDTH)                                          \
        do {                                                            \
                BUN hashnil = HASHnil(hsh);                             \
+               BUN yes = 0, no = 0, false_positive = 0;                \
                for (lo = lstart - BUNfirst(l) + l->hseqbase;           \
                     lstart < lend;                                     \
                     lo++) {                                            \
+                       int ask;                                        \
                        v = FVALUE(l, lstart);                          \
                        lstart++;                                       \
                        nr = 0;                                         \
-                       if (*(const TYPE*)v != TYPE##_nil) {            \
+                       ask = BLOOMask((BUN) (*(TYPE*)v), r->T->bloom); \
+                       no++;                                           \
+                       if (*(const TYPE*)v != TYPE##_nil && ask) {     \
+                               yes++; no--;                            \
                                for (rb = HASHget##WIDTH(hsh, hash_##TYPE(hsh, 
v)); \
                                     rb != hashnil;                     \
                                     rb = HASHgetlink##WIDTH(hsh, rb))  \
@@ -2623,6 +2628,7 @@ binsearchcand(const oid *cand, BUN lo, B
                                        }                               \
                        }                                               \
                        if (nr == 0) {                                  \
+                               if (ask) false_positive++;              \
                                lskipped = BATcount(r1) > 0;            \
                        } else {                                        \
                                if (lskipped) {                         \
@@ -2636,6 +2642,11 @@ binsearchcand(const oid *cand, BUN lo, B
                                        r1->trevsorted = 0;             \
                        }                                               \
                }                                                       \
+               fprintf(stderr,"#BATbloom(b=%s#" BUNFMT ") %s: "        \
+                               "ask bloom filter: yes = " BUNFMT ", no = " 
BUNFMT \
+                               ", probes = " BUNFMT ", false positives = " 
BUNFMT "\n", \
+                               BATgetId(r), BATcount(r), r->T->heap.filename, \
+                               yes, no, lo, false_positive );          \
        } while (0)
 
 static gdk_return
@@ -2767,6 +2778,11 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *
        hsh = r->T->hash;
        t = ATOMbasetype(r->ttype);
 
+       /* check for bloom filter on right */
+       if (!BATcheckbloom(r)) {
+               BATbloom(r);
+       }
+
        if (lcand == NULL && rcand == NULL && lvars == NULL &&
            !nil_matches && !nil_on_miss && !semi && !only_misses &&
            l->ttype != TYPE_void && (t == TYPE_int || t == TYPE_lng)) {
@@ -3974,8 +3990,8 @@ BATjoin(BAT **r1p, BAT **r2p, BAT *l, BA
                /* both sorted */
                return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0, 
maxsize, t0, 0);
        } else if (lhash && rhash) {
-               /* both have hash, smallest on right */ /* TODO: swap = lcount 
> rcount (small side left)*/
-               swap = lcount < rcount;
+               /* both have hash, smallest on left (TODO) */
+               swap = lcount > rcount;
                reason = "both have hash";
        } else if (lhash) {
                /* only left has hash, swap */
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -63,6 +63,9 @@ struct BATstore {
        __attribute__((__visibility__("hidden")));
 __hidden int BATcheckbloom(BAT *b)
        __attribute__((__visibility__("hidden")));
+__hidden int BLOOMask(BUN v, Bloomfilter *bloom)
+       __attribute__((__visibility__("hidden")));
+
 __hidden BATstore *BATcreatedesc(int tt, int heapnames, int role)
        __attribute__((__visibility__("hidden")));
 __hidden void BATdelete(BAT *b)
@@ -248,6 +251,7 @@ struct Bloomfilter {
        BUN mbits;
        int kfunc;
        lng mask;
+       size_t bytes;
        Heap *filter;
 };
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to