Changeset: 1f99f08168c2 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1f99f08168c2 Modified Files: clients/Tests/exports.stable.out gdk/gdk.h gdk/gdk_bat.c gdk/gdk_batop.c gdk/gdk_hash.c gdk/gdk_hash.h gdk/gdk_join.c gdk/gdk_select.c monetdb5/modules/kernel/bat5.c monetdb5/modules/mal/tokenizer.c Branch: linear-hashing Log Message:
Implemented linear hashing for the BAT hash tables. diffs (truncated from 1007 to 300 lines): diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -294,6 +294,7 @@ const char *GDKversion(void); size_t GDKvm_cursize(void); void *GDKzalloc(size_t size) __attribute__((__malloc__)) __attribute__((__alloc_size__(1))) __attribute__((__warn_unused_result__)); void HASHdestroy(BAT *b); +gdk_return HASHgrowbucket(BAT *b); BUN HASHlist(Hash *h, BUN i); BUN HASHprobe(const Hash *h, const void *v); void HEAP_free(Heap *heap, var_t block); diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2625,7 +2625,7 @@ gdk_export void VIEWbounds(BAT *b, BAT * enum prop_t { GDK_MIN_VALUE = 3, /* smallest non-nil value in BAT */ GDK_MAX_VALUE, /* largest non-nil value in BAT */ - GDK_HASH_MASK, /* last used hash mask */ + GDK_HASH_BUCKETS, /* last used hash bucket size */ }; /* diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -452,9 +452,6 @@ BATextend(BAT *b, BUN newcap) if (b->theap.base && HEAPextend(&b->theap, theap_size, b->batRestricted == BAT_READ) != GDK_SUCCEED) return GDK_FAIL; - HASHdestroy(b); - IMPSdestroy(b); - OIDXdestroy(b); return GDK_SUCCEED; } @@ -1082,7 +1079,7 @@ BUNappend(BAT *b, const void *t, bool fo for (prop = b->tprops; prop; prop = prop->next) if (prop->id != GDK_MAX_VALUE && prop->id != GDK_MIN_VALUE && - prop->id != GDK_HASH_MASK) { + prop->id != GDK_HASH_BUCKETS) { BATrmprop(b, prop->id); break; } @@ -1164,7 +1161,7 @@ BUNdelete(BAT *b, oid o) for (prop = b->tprops; prop; prop = prop->next) if (prop->id != GDK_MAX_VALUE && prop->id != GDK_MIN_VALUE && - prop->id != GDK_HASH_MASK) { + prop->id != GDK_HASH_BUCKETS) { BATrmprop(b, prop->id); break; } @@ -1252,7 +1249,7 @@ BUNinplace(BAT *b, BUN p, const void *t, for (prop = b->tprops; prop; prop = prop->next) if (prop->id != GDK_MAX_VALUE && prop->id != GDK_MIN_VALUE && - prop->id != GDK_HASH_MASK) { + prop->id != GDK_HASH_BUCKETS) { BATrmprop(b, prop->id); break; } @@ -2400,13 +2397,13 @@ BATassertProps(BAT *b) seenmin |= cmp == 0; } prb = HASHprobe(hs, valp); - for (hb = HASHget(hs,prb); + for (hb = HASHget(hs, prb); hb != HASHnil(hs); - hb = HASHgetlink(hs,hb)) + hb = HASHgetlink(hs, hb)) if (cmpf(valp, BUNtail(bi, hb)) == 0) assert(!b->tkey); - HASHputlink(hs,p, HASHget(hs,prb)); - HASHput(hs,prb,p); + HASHputlink(hs, p, HASHget(hs, prb)); + HASHput(hs, prb, p); assert(!b->tnonil || !isnil); seennil |= isnil; } diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -561,19 +561,14 @@ BATappend(BAT *b, BAT *n, BAT *s, bool f for (prop = b->tprops; prop; prop = prop->next) if (prop->id != GDK_MAX_VALUE && prop->id != GDK_MIN_VALUE && - prop->id != GDK_HASH_MASK) { + prop->id != GDK_HASH_BUCKETS) { BATrmprop(b, prop->id); break; } } while (prop); #endif - if (b->thash == (Hash *) 1 || BATcount(b) == 0 || - (b->thash && ((size_t *) b->thash->heapbckt.base)[0] & (1 << 24))) { - /* don't bother first loading the hash to then change - * it, or updating the hash if we replace the heap, - * also, we cannot maintain persistent hashes */ - HASHdestroy(b); - } + /* load hash so that we can maintain it */ + (void) BATcheckhash(b); if (b->ttype == TYPE_void) { /* b does not have storage, keep it that way if we can */ @@ -615,7 +610,7 @@ BATappend(BAT *b, BAT *n, BAT *s, bool f MT_lock_set(&b->batIdxLock); if (b->thash == (Hash *) 1 || (b->thash != NULL && - (2 * b->thash->mask) < (BATcount(b) + cnt))) { + (2 * NHASHBUCKETS(b->thash)) < (BATcount(b) + cnt))) { MT_lock_unset(&b->batIdxLock); HASHdestroy(b); } else { diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c --- a/gdk/gdk_hash.c +++ b/gdk/gdk_hash.c @@ -84,11 +84,39 @@ HASHclear(Hash *h) * rather than iteratively assigning individual * BUNi_NONE values in a for-loop */ - memset(h->Hash, 0xFF, (h->mask + 1) * h->width); + memset(h->Bckt, 0xFF, NHASHBUCKETS(h) * h->width); } -#define HASH_VERSION 2 -#define HASH_HEADER_SIZE 6 /* nr of size_t fields in header */ +#define HASH_VERSION 3 +#define HASH_HEADER_SIZE 8 /* nr of size_t fields in header */ + +/* calculate (uint8_t) floor(log2(mask)) */ +static inline uint8_t +HASHmasklevel(BUN mask) +{ + assert(mask != 0); + +#if SIZEOF_BUN == SIZEOF_INT && defined(__GNUC__) + return (uint8_t) (31 - __builtin_clz((unsigned int) mask)); +#elif SIZEOF_BUN == SIZEOF_INT && defined(_MSC_VER) + return (uint8_t) (31 - __lzcnt((unsigned int) mask)); +#elif SIZEOF_BUN == SIZEOF_LONG && defined(__GNUC__) + return (uint8_t) (63 - __builtin_clzl((unsigned long) mask)); +#elif SIZEOF_BUN == SIZEOF_LONG && defined(_MSC_VER) + return (uint8_t) (63 - __lzcnt64((unsigned __int64) mask)); +#else + uint8_t n = 0; + uint8_t c = SIZEOF_BUN * 4; + do { + BUN y = mask >> c; + if (y) { + n += c; + mask = y; + } + } while ((c >>= 1) != 0); + return n; +#endif +} gdk_return HASHnew(Hash *h, int tpe, BUN size, BUN mask, BUN count, bool bcktonly) @@ -105,7 +133,8 @@ HASHnew(Hash *h, int tpe, BUN size, BUN if (HEAPalloc(&h->heapbckt, mask + HASH_HEADER_SIZE * SIZEOF_SIZE_T / h->width, h->width) != GDK_SUCCEED) return GDK_FAIL; h->heapbckt.free = mask * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T; - h->mask = mask - 1; + h->level = HASHmasklevel(mask); + h->split = mask - ((BUN) 1 << h->level); switch (h->width) { case BUN2: h->nil = (BUN) BUN2_NONE; @@ -121,19 +150,129 @@ HASHnew(Hash *h, int tpe, BUN size, BUN default: assert(0); } - h->Hash = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T; + h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T; h->type = tpe; HASHclear(h); /* zero the mask */ ((size_t *) h->heapbckt.base)[0] = HASH_VERSION; ((size_t *) h->heapbckt.base)[1] = size; - ((size_t *) h->heapbckt.base)[2] = mask; - ((size_t *) h->heapbckt.base)[3] = h->width; - ((size_t *) h->heapbckt.base)[4] = count; - ((size_t *) h->heapbckt.base)[5] = 0; /* # filled slots (chain heads) */ + ((size_t *) h->heapbckt.base)[2] = h->level; + ((size_t *) h->heapbckt.base)[3] = h->split; + ((size_t *) h->heapbckt.base)[4] = h->width; + ((size_t *) h->heapbckt.base)[5] = count; + ((size_t *) h->heapbckt.base)[6] = h->nunique; + ((size_t *) h->heapbckt.base)[7] = h->nheads; ACCELDEBUG fprintf(stderr, "#HASHnew: create hash(size " BUNFMT ", mask " BUNFMT ", width %d, total " BUNFMT " bytes);\n", size, mask, h->width, (size + mask) * h->width); return GDK_SUCCEED; } +gdk_return +HASHgrowbucket(BAT *b) +{ + switch (ATOMsize(b->ttype)) { + case 1: + case 2: + /* no need to grow bucket list */ + return GDK_SUCCEED; + default: + break; + } + + Hash *h = b->thash; + BUN nbucket = NHASHBUCKETS(h); + + if (h->nunique < nbucket * 3 / 4) + return GDK_SUCCEED; + + BUN old = h->split; + BUN new = ((BUN) 1 << h->level) + h->split; + BATiter bi = bat_iterator(b); + BUN msk = (BUN) 1 << h->level; + + ACCELDEBUG fprintf(stderr, "#%s: %s(" ALGOBATFMT ") -> " BUNFMT " buckets\n", + MT_thread_getname(), __func__, ALGOBATPAR(b), nbucket + 1); + assert(h->heapbckt.free == nbucket * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T); + h->heapbckt.dirty = true; + if (((size_t *) h->heapbckt.base)[0] & (size_t) 1 << 24) { + /* persistent hash: remove persistency */ + ((size_t *) h->heapbckt.base)[0] &= ~((size_t) 1 << 24); + if (h->heapbckt.storage == STORE_MEM) { + int fd; + if ((fd = GDKfdlocate(h->heapbckt.farmid, + h->heapbckt.filename, + "rb+", NULL)) >= 0) { + if (write(fd, h->heapbckt.base, SIZEOF_SIZE_T) >= 0) { + if (!(GDKdebug & NOSYNCMASK)) { +#if defined(NATIVE_WIN32) + _commit(fd); +#elif defined(HAVE_FDATASYNC) + fdatasync(fd); +#elif defined(HAVE_FSYNC) + fsync(fd); +#endif + } + } + close(fd); + } + } else { + if (!(GDKdebug & NOSYNCMASK)) + MT_msync(h->heapbckt.base, SIZEOF_SIZE_T); + } + } + if (h->heapbckt.free + h->width > h->heapbckt.size) { + if (HEAPextend(&h->heapbckt, + h->heapbckt.size + GDK_mmap_pagesize, + true) != GDK_SUCCEED) { + return GDK_FAIL; + } + h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T; + } + assert(h->heapbckt.free + h->width <= h->heapbckt.size); + if (++h->split == ((BUN) 1 << h->level)) { + h->level++; + h->split = 0; + } + h->heapbckt.free += h->width; + BUN lold, lnew, hb; + lold = lnew = HASHnil(h); + if ((hb = HASHget(h, old)) != HASHnil(h)) { + h->nheads--; + do { + const void *v = BUNtail(bi, hb); + BUN hsh = ATOMhash(h->type, v); + assert((hsh & (msk - 1)) == old); + if (hsh & msk) { + /* move to new list */ + if (lnew == HASHnil(h)) { + HASHput(h, new, hb); + h->nheads++; + } else + HASHputlink(h, lnew, hb); + lnew = hb; + } else { + /* move to old list */ + if (lold == HASHnil(h)) { + h->nheads++; + HASHput(h, old, hb); + } else + HASHputlink(h, lold, hb); + lold = hb; + } + hb = HASHgetlink(h, hb); + } while (hb != HASHnil(h)); + } + if (lnew == HASHnil(h)) + HASHput(h, new, HASHnil(h)); + else + HASHputlink(h, lnew, HASHnil(h)); + if (lold == HASHnil(h)) + HASHput(h, old, HASHnil(h)); + else + HASHputlink(h, lold, HASHnil(h)); + BATsetprop_nolock(b, GDK_HASH_BUCKETS, TYPE_oid, _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list