Changeset: 1f99f08168c2 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1f99f08168c2
Modified Files:
        clients/Tests/exports.stable.out
        gdk/gdk.h
        gdk/gdk_bat.c
        gdk/gdk_batop.c
        gdk/gdk_hash.c
        gdk/gdk_hash.h
        gdk/gdk_join.c
        gdk/gdk_select.c
        monetdb5/modules/kernel/bat5.c
        monetdb5/modules/mal/tokenizer.c
Branch: linear-hashing
Log Message:

Implemented linear hashing for the BAT hash tables.


diffs (truncated from 1007 to 300 lines):

diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -294,6 +294,7 @@ const char *GDKversion(void);
 size_t GDKvm_cursize(void);
 void *GDKzalloc(size_t size) __attribute__((__malloc__)) 
__attribute__((__alloc_size__(1))) __attribute__((__warn_unused_result__));
 void HASHdestroy(BAT *b);
+gdk_return HASHgrowbucket(BAT *b);
 BUN HASHlist(Hash *h, BUN i);
 BUN HASHprobe(const Hash *h, const void *v);
 void HEAP_free(Heap *heap, var_t block);
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2625,7 +2625,7 @@ gdk_export void VIEWbounds(BAT *b, BAT *
 enum prop_t {
        GDK_MIN_VALUE = 3,      /* smallest non-nil value in BAT */
        GDK_MAX_VALUE,          /* largest non-nil value in BAT */
-       GDK_HASH_MASK,          /* last used hash mask */
+       GDK_HASH_BUCKETS,       /* last used hash bucket size */
 };
 
 /*
diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c
--- a/gdk/gdk_bat.c
+++ b/gdk/gdk_bat.c
@@ -452,9 +452,6 @@ BATextend(BAT *b, BUN newcap)
        if (b->theap.base &&
            HEAPextend(&b->theap, theap_size, b->batRestricted == BAT_READ) != 
GDK_SUCCEED)
                return GDK_FAIL;
-       HASHdestroy(b);
-       IMPSdestroy(b);
-       OIDXdestroy(b);
        return GDK_SUCCEED;
 }
 
@@ -1082,7 +1079,7 @@ BUNappend(BAT *b, const void *t, bool fo
                for (prop = b->tprops; prop; prop = prop->next)
                        if (prop->id != GDK_MAX_VALUE &&
                            prop->id != GDK_MIN_VALUE &&
-                           prop->id != GDK_HASH_MASK) {
+                           prop->id != GDK_HASH_BUCKETS) {
                                BATrmprop(b, prop->id);
                                break;
                        }
@@ -1164,7 +1161,7 @@ BUNdelete(BAT *b, oid o)
                for (prop = b->tprops; prop; prop = prop->next)
                        if (prop->id != GDK_MAX_VALUE &&
                            prop->id != GDK_MIN_VALUE &&
-                           prop->id != GDK_HASH_MASK) {
+                           prop->id != GDK_HASH_BUCKETS) {
                                BATrmprop(b, prop->id);
                                break;
                        }
@@ -1252,7 +1249,7 @@ BUNinplace(BAT *b, BUN p, const void *t,
                        for (prop = b->tprops; prop; prop = prop->next)
                                if (prop->id != GDK_MAX_VALUE &&
                                    prop->id != GDK_MIN_VALUE &&
-                                   prop->id != GDK_HASH_MASK) {
+                                   prop->id != GDK_HASH_BUCKETS) {
                                        BATrmprop(b, prop->id);
                                        break;
                                }
@@ -2400,13 +2397,13 @@ BATassertProps(BAT *b)
                                        seenmin |= cmp == 0;
                                }
                                prb = HASHprobe(hs, valp);
-                               for (hb = HASHget(hs,prb);
+                               for (hb = HASHget(hs, prb);
                                     hb != HASHnil(hs);
-                                    hb = HASHgetlink(hs,hb))
+                                    hb = HASHgetlink(hs, hb))
                                        if (cmpf(valp, BUNtail(bi, hb)) == 0)
                                                assert(!b->tkey);
-                               HASHputlink(hs,p, HASHget(hs,prb));
-                               HASHput(hs,prb,p);
+                               HASHputlink(hs, p, HASHget(hs, prb));
+                               HASHput(hs, prb, p);
                                assert(!b->tnonil || !isnil);
                                seennil |= isnil;
                        }
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -561,19 +561,14 @@ BATappend(BAT *b, BAT *n, BAT *s, bool f
                for (prop = b->tprops; prop; prop = prop->next)
                        if (prop->id != GDK_MAX_VALUE &&
                            prop->id != GDK_MIN_VALUE &&
-                           prop->id != GDK_HASH_MASK) {
+                           prop->id != GDK_HASH_BUCKETS) {
                                BATrmprop(b, prop->id);
                                break;
                        }
        } while (prop);
 #endif
-       if (b->thash == (Hash *) 1 || BATcount(b) == 0 ||
-           (b->thash && ((size_t *) b->thash->heapbckt.base)[0] & (1 << 24))) {
-               /* don't bother first loading the hash to then change
-                * it, or updating the hash if we replace the heap,
-                * also, we cannot maintain persistent hashes */
-               HASHdestroy(b);
-       }
+       /* load hash so that we can maintain it */
+       (void) BATcheckhash(b);
 
        if (b->ttype == TYPE_void) {
                /* b does not have storage, keep it that way if we can */
@@ -615,7 +610,7 @@ BATappend(BAT *b, BAT *n, BAT *s, bool f
        MT_lock_set(&b->batIdxLock);
        if (b->thash == (Hash *) 1 ||
            (b->thash != NULL &&
-            (2 * b->thash->mask) < (BATcount(b) + cnt))) {
+            (2 * NHASHBUCKETS(b->thash)) < (BATcount(b) + cnt))) {
                MT_lock_unset(&b->batIdxLock);
                HASHdestroy(b);
        } else {
diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c
--- a/gdk/gdk_hash.c
+++ b/gdk/gdk_hash.c
@@ -84,11 +84,39 @@ HASHclear(Hash *h)
         * rather than iteratively assigning individual
         * BUNi_NONE values in a for-loop
         */
-       memset(h->Hash, 0xFF, (h->mask + 1) * h->width);
+       memset(h->Bckt, 0xFF, NHASHBUCKETS(h) * h->width);
 }
 
-#define HASH_VERSION           2
-#define HASH_HEADER_SIZE       6 /* nr of size_t fields in header */
+#define HASH_VERSION           3
+#define HASH_HEADER_SIZE       8       /* nr of size_t fields in header */
+
+/* calculate (uint8_t) floor(log2(mask)) */
+static inline uint8_t
+HASHmasklevel(BUN mask)
+{
+       assert(mask != 0);
+
+#if SIZEOF_BUN == SIZEOF_INT && defined(__GNUC__)
+       return (uint8_t) (31 - __builtin_clz((unsigned int) mask));
+#elif SIZEOF_BUN == SIZEOF_INT && defined(_MSC_VER)
+       return (uint8_t) (31 - __lzcnt((unsigned int) mask));
+#elif SIZEOF_BUN == SIZEOF_LONG && defined(__GNUC__)
+       return (uint8_t) (63 - __builtin_clzl((unsigned long) mask));
+#elif SIZEOF_BUN == SIZEOF_LONG && defined(_MSC_VER)
+       return (uint8_t) (63 - __lzcnt64((unsigned __int64) mask));
+#else
+       uint8_t n = 0;
+       uint8_t c = SIZEOF_BUN * 4;
+       do {
+               BUN y = mask >> c;
+               if (y) {
+                       n += c;
+                       mask = y;
+               }
+       } while ((c >>= 1) != 0);
+       return n;
+#endif
+}
 
 gdk_return
 HASHnew(Hash *h, int tpe, BUN size, BUN mask, BUN count, bool bcktonly)
@@ -105,7 +133,8 @@ HASHnew(Hash *h, int tpe, BUN size, BUN 
        if (HEAPalloc(&h->heapbckt, mask + HASH_HEADER_SIZE * SIZEOF_SIZE_T / 
h->width, h->width) != GDK_SUCCEED)
                return GDK_FAIL;
        h->heapbckt.free = mask * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
-       h->mask = mask - 1;
+       h->level = HASHmasklevel(mask);
+       h->split = mask - ((BUN) 1 << h->level);
        switch (h->width) {
        case BUN2:
                h->nil = (BUN) BUN2_NONE;
@@ -121,19 +150,129 @@ HASHnew(Hash *h, int tpe, BUN size, BUN 
        default:
                assert(0);
        }
-       h->Hash = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
+       h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
        h->type = tpe;
        HASHclear(h);           /* zero the mask */
        ((size_t *) h->heapbckt.base)[0] = HASH_VERSION;
        ((size_t *) h->heapbckt.base)[1] = size;
-       ((size_t *) h->heapbckt.base)[2] = mask;
-       ((size_t *) h->heapbckt.base)[3] = h->width;
-       ((size_t *) h->heapbckt.base)[4] = count;
-       ((size_t *) h->heapbckt.base)[5] = 0; /* # filled slots (chain heads) */
+       ((size_t *) h->heapbckt.base)[2] = h->level;
+       ((size_t *) h->heapbckt.base)[3] = h->split;
+       ((size_t *) h->heapbckt.base)[4] = h->width;
+       ((size_t *) h->heapbckt.base)[5] = count;
+       ((size_t *) h->heapbckt.base)[6] = h->nunique;
+       ((size_t *) h->heapbckt.base)[7] = h->nheads;
        ACCELDEBUG fprintf(stderr, "#HASHnew: create hash(size " BUNFMT ", mask 
" BUNFMT ", width %d, total " BUNFMT " bytes);\n", size, mask, h->width, (size 
+ mask) * h->width);
        return GDK_SUCCEED;
 }
 
+gdk_return
+HASHgrowbucket(BAT *b)
+{
+       switch (ATOMsize(b->ttype)) {
+       case 1:
+       case 2:
+               /* no need to grow bucket list */
+               return GDK_SUCCEED;
+       default:
+               break;
+       }
+
+       Hash *h = b->thash;
+       BUN nbucket = NHASHBUCKETS(h);
+
+       if (h->nunique < nbucket * 3 / 4)
+               return GDK_SUCCEED;
+
+       BUN old = h->split;
+       BUN new = ((BUN) 1 << h->level) + h->split;
+       BATiter bi = bat_iterator(b);
+       BUN msk = (BUN) 1 << h->level;
+
+       ACCELDEBUG fprintf(stderr, "#%s: %s(" ALGOBATFMT ") -> " BUNFMT " 
buckets\n",
+                          MT_thread_getname(), __func__, ALGOBATPAR(b), 
nbucket + 1);
+       assert(h->heapbckt.free == nbucket * h->width + HASH_HEADER_SIZE * 
SIZEOF_SIZE_T);
+       h->heapbckt.dirty = true;
+       if (((size_t *) h->heapbckt.base)[0] & (size_t) 1 << 24) {
+               /* persistent hash: remove persistency */
+               ((size_t *) h->heapbckt.base)[0] &= ~((size_t) 1 << 24);
+               if (h->heapbckt.storage == STORE_MEM) {
+                       int fd;
+                       if ((fd = GDKfdlocate(h->heapbckt.farmid,
+                                             h->heapbckt.filename,
+                                             "rb+", NULL)) >= 0) {
+                               if (write(fd, h->heapbckt.base, SIZEOF_SIZE_T) 
>= 0) {
+                                       if (!(GDKdebug & NOSYNCMASK)) {
+#if defined(NATIVE_WIN32)
+                                               _commit(fd);
+#elif defined(HAVE_FDATASYNC)
+                                               fdatasync(fd);
+#elif defined(HAVE_FSYNC)
+                                               fsync(fd);
+#endif
+                                       }
+                               }
+                               close(fd);
+                       }
+               } else {
+                       if (!(GDKdebug & NOSYNCMASK))
+                               MT_msync(h->heapbckt.base, SIZEOF_SIZE_T);
+               }
+       }
+       if (h->heapbckt.free + h->width > h->heapbckt.size) {
+               if (HEAPextend(&h->heapbckt,
+                              h->heapbckt.size + GDK_mmap_pagesize,
+                              true) != GDK_SUCCEED) {
+                       return GDK_FAIL;
+               }
+               h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
+       }
+       assert(h->heapbckt.free + h->width <= h->heapbckt.size);
+       if (++h->split == ((BUN) 1 << h->level)) {
+               h->level++;
+               h->split = 0;
+       }
+       h->heapbckt.free += h->width;
+       BUN lold, lnew, hb;
+       lold = lnew = HASHnil(h);
+       if ((hb = HASHget(h, old)) != HASHnil(h)) {
+               h->nheads--;
+               do {
+                       const void *v = BUNtail(bi, hb);
+                       BUN hsh = ATOMhash(h->type, v);
+                       assert((hsh & (msk - 1)) == old);
+                       if (hsh & msk) {
+                               /* move to new list */
+                               if (lnew == HASHnil(h)) {
+                                       HASHput(h, new, hb);
+                                       h->nheads++;
+                               } else
+                                       HASHputlink(h, lnew, hb);
+                               lnew = hb;
+                       } else {
+                               /* move to old list */
+                               if (lold == HASHnil(h)) {
+                                       h->nheads++;
+                                       HASHput(h, old, hb);
+                               } else
+                                       HASHputlink(h, lold, hb);
+                               lold = hb;
+                       }
+                       hb = HASHgetlink(h, hb);
+               } while (hb != HASHnil(h));
+       }
+       if (lnew == HASHnil(h))
+               HASHput(h, new, HASHnil(h));
+       else
+               HASHputlink(h, lnew, HASHnil(h));
+       if (lold == HASHnil(h))
+               HASHput(h, old, HASHnil(h));
+       else
+               HASHputlink(h, lold, HASHnil(h));
+       BATsetprop_nolock(b, GDK_HASH_BUCKETS, TYPE_oid,
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to