Changeset: 6f7d34b56bed for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6f7d34b56bed Modified Files: gdk/gdk_hash.c gdk/gdk_hash.h Branch: linear-hashing Log Message:
Use two masks instead of level + split. diffs (229 lines): diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c --- a/gdk/gdk_hash.c +++ b/gdk/gdk_hash.c @@ -49,13 +49,9 @@ HASHwidth(BUN hashsize) #endif } -BUN -HASHmask(BUN cnt) +static inline BUN +hashmask(BUN m) { - BUN m = cnt; - -#if 0 - /* find largest power of 2 smaller than or equal to cnt */ m |= m >> 1; m |= m >> 2; m |= m >> 4; @@ -64,6 +60,17 @@ HASHmask(BUN cnt) #if SIZEOF_BUN == 8 m |= m >> 32; #endif + return m; +} + +BUN +HASHmask(BUN cnt) +{ + BUN m = cnt; + +#if 0 + /* find largest power of 2 smaller than or equal to cnt */ + m = hashmask(m); m -= m >> 1; /* if cnt is more than 1/3 into the gap between m and 2*m, @@ -91,35 +98,7 @@ HASHclear(Hash *h) } #define HASH_VERSION 3 -#define HASH_HEADER_SIZE 8 /* nr of size_t fields in header */ - -/* calculate (uint8_t) floor(log2(mask)) */ -static inline uint8_t -HASHmasklevel(BUN mask) -{ - assert(mask != 0); - -#if SIZEOF_BUN == SIZEOF_INT && defined(__GNUC__) - return (uint8_t) (31 - __builtin_clz((unsigned int) mask)); -#elif SIZEOF_BUN == SIZEOF_INT && defined(_MSC_VER) - return (uint8_t) (31 - __lzcnt((unsigned int) mask)); -#elif SIZEOF_BUN == SIZEOF_LONG && defined(__GNUC__) - return (uint8_t) (63 - __builtin_clzl((unsigned long) mask)); -#elif SIZEOF_BUN == SIZEOF_LONG && defined(_MSC_VER) - return (uint8_t) (63 - __lzcnt64((unsigned __int64) mask)); -#else - uint8_t n = 0; - uint8_t c = SIZEOF_BUN * 4; - do { - BUN y = mask >> c; - if (y) { - n += c; - mask = y; - } - } while ((c >>= 1) != 0); - return n; -#endif -} +#define HASH_HEADER_SIZE 7 /* nr of size_t fields in header */ static void doHASHdestroy(BAT *b, Hash *hs) @@ -160,8 +139,15 @@ HASHnew(Hash *h, int tpe, BUN size, BUN if (HEAPalloc(&h->heapbckt, mask + HASH_HEADER_SIZE * SIZEOF_SIZE_T / h->width, h->width) != GDK_SUCCEED) return GDK_FAIL; h->heapbckt.free = mask * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T; - h->level = HASHmasklevel(mask); - h->split = mask - ((BUN) 1 << h->level); + h->nbucket = mask; + if (mask & (mask - 1)) { + h->mask2 = hashmask(mask); + h->mask1 = h->mask2 >> 1; + } else { + /* mask is a power of two */ + h->mask1 = mask - 1; + h->mask2 = h->mask1 << 1 | 1; + } switch (h->width) { case BUN2: h->nil = (BUN) BUN2_NONE; @@ -182,12 +168,11 @@ HASHnew(Hash *h, int tpe, BUN size, BUN HASHclear(h); /* zero the mask */ ((size_t *) h->heapbckt.base)[0] = (size_t) HASH_VERSION; ((size_t *) h->heapbckt.base)[1] = (size_t) size; - ((size_t *) h->heapbckt.base)[2] = (size_t) h->level; - ((size_t *) h->heapbckt.base)[3] = (size_t) h->split; - ((size_t *) h->heapbckt.base)[4] = (size_t) h->width; - ((size_t *) h->heapbckt.base)[5] = (size_t) count; - ((size_t *) h->heapbckt.base)[6] = (size_t) h->nunique; - ((size_t *) h->heapbckt.base)[7] = (size_t) h->nheads; + ((size_t *) h->heapbckt.base)[2] = (size_t) h->nbucket; + ((size_t *) h->heapbckt.base)[3] = (size_t) h->width; + ((size_t *) h->heapbckt.base)[4] = (size_t) count; + ((size_t *) h->heapbckt.base)[5] = (size_t) h->nunique; + ((size_t *) h->heapbckt.base)[6] = (size_t) h->nheads; ACCELDEBUG fprintf(stderr, "#%s: HASHnew: create hash(size " BUNFMT ", mask " BUNFMT ", width %d, total " BUNFMT " bytes);\n", MT_thread_getname(), size, mask, h->width, (size + mask) * h->width); return GDK_SUCCEED; } @@ -332,10 +317,10 @@ HASHgrowbucket(BAT *b) } } while (h->nunique >= (nbucket = NHASHBUCKETS(h)) * 7 / 8) { - BUN old = h->split; - BUN new = ((BUN) 1 << h->level) + h->split; + BUN new = h->nbucket; + BUN old = new & h->mask1; BATiter bi = bat_iterator(b); - BUN msk = (BUN) 1 << h->level; + BUN msk = h->mask1 + 1; /* == h->mask2 - h->mask1 */ assert(h->heapbckt.free == nbucket * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T); if (h->heapbckt.free + h->width > h->heapbckt.size) { @@ -347,10 +332,11 @@ HASHgrowbucket(BAT *b) h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T; } assert(h->heapbckt.free + h->width <= h->heapbckt.size); - if (++h->split == ((BUN) 1 << h->level)) { - h->level++; - h->split = 0; + if (h->nbucket == h->mask2) { + h->mask1 = h->mask2; + h->mask2 = h->mask2 << 1 | 1; } + h->nbucket++; h->heapbckt.free += h->width; BUN lold, lnew, hb; lold = lnew = HASHnil(h); @@ -450,9 +436,9 @@ BATcheckhash(BAT *b) ((size_t) 1 << 24) | #endif HASH_VERSION) && - hdata[5] == (size_t) BATcount(b) && + hdata[4] == (size_t) BATcount(b) && fstat(fd, &st) == 0 && - st.st_size >= (off_t) (h->heapbckt.size = h->heapbckt.free = (((BUN) 1 << (h->level = (uint8_t) hdata[2])) + (h->split = (BUN) hdata[3])) * (BUN) (h->width = (uint8_t) hdata[4]) + HASH_HEADER_SIZE * SIZEOF_SIZE_T) && + st.st_size >= (off_t) (h->heapbckt.size = h->heapbckt.free = (h->nbucket = (BUN) hdata[2]) * (BUN) (h->width = (uint8_t) hdata[3]) + HASH_HEADER_SIZE * SIZEOF_SIZE_T) && close(fd) == 0 && (fd = GDKfdlocate(h->heaplink.farmid, nme, "rb+", "thashl")) >= 0 && fstat(fd, &st) == 0 && @@ -460,8 +446,15 @@ BATcheckhash(BAT *b) st.st_size >= (off_t) (h->heaplink.size = h->heaplink.free = hdata[1] * h->width) && HEAPload(&h->heaplink, nme, "thashl", false) == GDK_SUCCEED && HEAPload(&h->heapbckt, nme, "thashb", false) == GDK_SUCCEED) { - h->nunique = hdata[6]; - h->nheads = hdata[7]; + if (h->nbucket & (h->nbucket - 1)) { + h->mask2 = hashmask(h->nbucket); + h->mask1 = h->mask2 >> 1; + } else { + h->mask1 = h->nbucket - 1; + h->mask2 = h->mask1 << 1 | 1; + } + h->nunique = hdata[5]; + h->nheads = hdata[6]; h->type = ATOMtype(b->ttype); switch (h->width) { case BUN2: @@ -538,12 +531,11 @@ BAThashsave(BAT *b, bool dosync) * mean time */ ((size_t *) hp->base)[0] = (size_t) HASH_VERSION; ((size_t *) hp->base)[1] = (size_t) (h->heaplink.free / h->width); - ((size_t *) hp->base)[2] = (size_t) h->level; - ((size_t *) hp->base)[3] = (size_t) h->split; - ((size_t *) hp->base)[4] = (size_t) h->width; - ((size_t *) hp->base)[5] = (size_t) BATcount(b); - ((size_t *) hp->base)[6] = (size_t) h->nunique; - ((size_t *) hp->base)[7] = (size_t) h->nheads; + ((size_t *) hp->base)[2] = (size_t) h->nbucket; + ((size_t *) hp->base)[3] = (size_t) h->width; + ((size_t *) hp->base)[4] = (size_t) BATcount(b); + ((size_t *) hp->base)[5] = (size_t) h->nunique; + ((size_t *) hp->base)[6] = (size_t) h->nheads; if (!b->theap.dirty && HEAPsave(&h->heaplink, h->heaplink.filename, NULL, dosync) == GDK_SUCCEED && HEAPsave(hp, hp->filename, NULL, dosync) == GDK_SUCCEED) { diff --git a/gdk/gdk_hash.h b/gdk/gdk_hash.h --- a/gdk/gdk_hash.h +++ b/gdk/gdk_hash.h @@ -23,8 +23,9 @@ typedef struct Hash { int type; /* type of index entity */ uint8_t width; /* width of hash entries */ - uint8_t level; /* mask is (1<<.level)-1 or (2<<.level)-1 */ - BUN split; /* ... depending on hash value and .split */ + BUN mask1; /* .mask1 < .nbucket <= .mask2 */ + BUN mask2; + BUN nbucket; /* ... depending on hash value and .nbucket */ BUN nil; /* nil representation */ BUN nunique; /* number of unique values */ BUN nheads; /* number of chain heads */ @@ -33,13 +34,11 @@ typedef struct Hash { Heap heaplink; /* heap where the hash links are stored */ Heap heapbckt; /* heap where the hash buckets are stored */ } Hash; -#define NHASHBUCKETS(h) (((BUN) 1 << (h)->level) + (h)->split) +#define NHASHBUCKETS(h) ((h)->nbucket) static inline BUN HASHbucket(const Hash *h, BUN v) { - return (v & (((BUN) 1 << h->level) - 1)) < h->split - ? v & (((BUN) 2 << h->level) - 1) - : v & (((BUN) 1 << h->level) - 1); + return (v &= h->mask2) < h->nbucket ? v : v & h->mask1; } gdk_export gdk_return BAThash(BAT *b); @@ -176,8 +175,8 @@ HASHgetlink(Hash *h, BUN i) #define hash_loc(H,V) hash_any(H,V) #define hash_var(H,V) hash_any(H,V) #define hash_any(H,V) HASHbucket(H, ATOMhash((H)->type, (V))) -#define hash_bte(H,V) (assert((H)->level >= 8), (BUN) mix_bte(*(const unsigned char*) (V))) -#define hash_sht(H,V) (assert((H)->level >= 16), (BUN) mix_sht(*(const unsigned short*) (V))) +#define hash_bte(H,V) (assert((H)->nbucket >= 256), (BUN) mix_bte(*(const unsigned char*) (V))) +#define hash_sht(H,V) (assert((H)->nbucket >= 65536), (BUN) mix_sht(*(const unsigned short*) (V))) #define hash_int(H,V) HASHbucket(H, (BUN) mix_int(*(const unsigned int *) (V))) /* XXX return size_t-sized value for 8-byte oid? */ #define hash_lng(H,V) HASHbucket(H, (BUN) mix_lng(*(const ulng *) (V))) _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list