Changeset: 6f7d34b56bed for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6f7d34b56bed
Modified Files:
        gdk/gdk_hash.c
        gdk/gdk_hash.h
Branch: linear-hashing
Log Message:

Use two masks instead of level + split.


diffs (229 lines):

diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c
--- a/gdk/gdk_hash.c
+++ b/gdk/gdk_hash.c
@@ -49,13 +49,9 @@ HASHwidth(BUN hashsize)
 #endif
 }
 
-BUN
-HASHmask(BUN cnt)
+static inline BUN
+hashmask(BUN m)
 {
-       BUN m = cnt;
-
-#if 0
-       /* find largest power of 2 smaller than or equal to cnt */
        m |= m >> 1;
        m |= m >> 2;
        m |= m >> 4;
@@ -64,6 +60,17 @@ HASHmask(BUN cnt)
 #if SIZEOF_BUN == 8
        m |= m >> 32;
 #endif
+       return m;
+}
+
+BUN
+HASHmask(BUN cnt)
+{
+       BUN m = cnt;
+
+#if 0
+       /* find largest power of 2 smaller than or equal to cnt */
+       m = hashmask(m);
        m -= m >> 1;
 
        /* if cnt is more than 1/3 into the gap between m and 2*m,
@@ -91,35 +98,7 @@ HASHclear(Hash *h)
 }
 
 #define HASH_VERSION           3
-#define HASH_HEADER_SIZE       8       /* nr of size_t fields in header */
-
-/* calculate (uint8_t) floor(log2(mask)) */
-static inline uint8_t
-HASHmasklevel(BUN mask)
-{
-       assert(mask != 0);
-
-#if SIZEOF_BUN == SIZEOF_INT && defined(__GNUC__)
-       return (uint8_t) (31 - __builtin_clz((unsigned int) mask));
-#elif SIZEOF_BUN == SIZEOF_INT && defined(_MSC_VER)
-       return (uint8_t) (31 - __lzcnt((unsigned int) mask));
-#elif SIZEOF_BUN == SIZEOF_LONG && defined(__GNUC__)
-       return (uint8_t) (63 - __builtin_clzl((unsigned long) mask));
-#elif SIZEOF_BUN == SIZEOF_LONG && defined(_MSC_VER)
-       return (uint8_t) (63 - __lzcnt64((unsigned __int64) mask));
-#else
-       uint8_t n = 0;
-       uint8_t c = SIZEOF_BUN * 4;
-       do {
-               BUN y = mask >> c;
-               if (y) {
-                       n += c;
-                       mask = y;
-               }
-       } while ((c >>= 1) != 0);
-       return n;
-#endif
-}
+#define HASH_HEADER_SIZE       7       /* nr of size_t fields in header */
 
 static void
 doHASHdestroy(BAT *b, Hash *hs)
@@ -160,8 +139,15 @@ HASHnew(Hash *h, int tpe, BUN size, BUN 
        if (HEAPalloc(&h->heapbckt, mask + HASH_HEADER_SIZE * SIZEOF_SIZE_T / 
h->width, h->width) != GDK_SUCCEED)
                return GDK_FAIL;
        h->heapbckt.free = mask * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
-       h->level = HASHmasklevel(mask);
-       h->split = mask - ((BUN) 1 << h->level);
+       h->nbucket = mask;
+       if (mask & (mask - 1)) {
+               h->mask2 = hashmask(mask);
+               h->mask1 = h->mask2 >> 1;
+       } else {
+               /* mask is a power of two */
+               h->mask1 = mask - 1;
+               h->mask2 = h->mask1 << 1 | 1;
+       }
        switch (h->width) {
        case BUN2:
                h->nil = (BUN) BUN2_NONE;
@@ -182,12 +168,11 @@ HASHnew(Hash *h, int tpe, BUN size, BUN 
        HASHclear(h);           /* zero the mask */
        ((size_t *) h->heapbckt.base)[0] = (size_t) HASH_VERSION;
        ((size_t *) h->heapbckt.base)[1] = (size_t) size;
-       ((size_t *) h->heapbckt.base)[2] = (size_t) h->level;
-       ((size_t *) h->heapbckt.base)[3] = (size_t) h->split;
-       ((size_t *) h->heapbckt.base)[4] = (size_t) h->width;
-       ((size_t *) h->heapbckt.base)[5] = (size_t) count;
-       ((size_t *) h->heapbckt.base)[6] = (size_t) h->nunique;
-       ((size_t *) h->heapbckt.base)[7] = (size_t) h->nheads;
+       ((size_t *) h->heapbckt.base)[2] = (size_t) h->nbucket;
+       ((size_t *) h->heapbckt.base)[3] = (size_t) h->width;
+       ((size_t *) h->heapbckt.base)[4] = (size_t) count;
+       ((size_t *) h->heapbckt.base)[5] = (size_t) h->nunique;
+       ((size_t *) h->heapbckt.base)[6] = (size_t) h->nheads;
        ACCELDEBUG fprintf(stderr, "#%s: HASHnew: create hash(size " BUNFMT ", 
mask " BUNFMT ", width %d, total " BUNFMT " bytes);\n", MT_thread_getname(), 
size, mask, h->width, (size + mask) * h->width);
        return GDK_SUCCEED;
 }
@@ -332,10 +317,10 @@ HASHgrowbucket(BAT *b)
                }
        }
        while (h->nunique >= (nbucket = NHASHBUCKETS(h)) * 7 / 8) {
-               BUN old = h->split;
-               BUN new = ((BUN) 1 << h->level) + h->split;
+               BUN new = h->nbucket;
+               BUN old = new & h->mask1;
                BATiter bi = bat_iterator(b);
-               BUN msk = (BUN) 1 << h->level;
+               BUN msk = h->mask1 + 1; /* == h->mask2 - h->mask1 */
 
                assert(h->heapbckt.free == nbucket * h->width + 
HASH_HEADER_SIZE * SIZEOF_SIZE_T);
                if (h->heapbckt.free + h->width > h->heapbckt.size) {
@@ -347,10 +332,11 @@ HASHgrowbucket(BAT *b)
                        h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * 
SIZEOF_SIZE_T;
                }
                assert(h->heapbckt.free + h->width <= h->heapbckt.size);
-               if (++h->split == ((BUN) 1 << h->level)) {
-                       h->level++;
-                       h->split = 0;
+               if (h->nbucket == h->mask2) {
+                       h->mask1 = h->mask2;
+                       h->mask2 = h->mask2 << 1 | 1;
                }
+               h->nbucket++;
                h->heapbckt.free += h->width;
                BUN lold, lnew, hb;
                lold = lnew = HASHnil(h);
@@ -450,9 +436,9 @@ BATcheckhash(BAT *b)
                                                    ((size_t) 1 << 24) |
 #endif
                                                    HASH_VERSION) &&
-                                           hdata[5] == (size_t) BATcount(b) &&
+                                           hdata[4] == (size_t) BATcount(b) &&
                                            fstat(fd, &st) == 0 &&
-                                           st.st_size >= (off_t) 
(h->heapbckt.size = h->heapbckt.free = (((BUN) 1 << (h->level = (uint8_t) 
hdata[2])) + (h->split = (BUN) hdata[3])) * (BUN) (h->width = (uint8_t) 
hdata[4]) + HASH_HEADER_SIZE * SIZEOF_SIZE_T) &&
+                                           st.st_size >= (off_t) 
(h->heapbckt.size = h->heapbckt.free = (h->nbucket = (BUN) hdata[2]) * (BUN) 
(h->width = (uint8_t) hdata[3]) + HASH_HEADER_SIZE * SIZEOF_SIZE_T) &&
                                            close(fd) == 0 &&
                                            (fd = 
GDKfdlocate(h->heaplink.farmid, nme, "rb+", "thashl")) >= 0 &&
                                            fstat(fd, &st) == 0 &&
@@ -460,8 +446,15 @@ BATcheckhash(BAT *b)
                                            st.st_size >= (off_t) 
(h->heaplink.size = h->heaplink.free = hdata[1] * h->width) &&
                                            HEAPload(&h->heaplink, nme, 
"thashl", false) == GDK_SUCCEED &&
                                            HEAPload(&h->heapbckt, nme, 
"thashb", false) == GDK_SUCCEED) {
-                                               h->nunique = hdata[6];
-                                               h->nheads = hdata[7];
+                                               if (h->nbucket & (h->nbucket - 
1)) {
+                                                       h->mask2 = 
hashmask(h->nbucket);
+                                                       h->mask1 = h->mask2 >> 
1;
+                                               } else {
+                                                       h->mask1 = h->nbucket - 
1;
+                                                       h->mask2 = h->mask1 << 
1 | 1;
+                                               }
+                                               h->nunique = hdata[5];
+                                               h->nheads = hdata[6];
                                                h->type = ATOMtype(b->ttype);
                                                switch (h->width) {
                                                case BUN2:
@@ -538,12 +531,11 @@ BAThashsave(BAT *b, bool dosync)
                 * mean time */
                ((size_t *) hp->base)[0] = (size_t) HASH_VERSION;
                ((size_t *) hp->base)[1] = (size_t) (h->heaplink.free / 
h->width);
-               ((size_t *) hp->base)[2] = (size_t) h->level;
-               ((size_t *) hp->base)[3] = (size_t) h->split;
-               ((size_t *) hp->base)[4] = (size_t) h->width;
-               ((size_t *) hp->base)[5] = (size_t) BATcount(b);
-               ((size_t *) hp->base)[6] = (size_t) h->nunique;
-               ((size_t *) hp->base)[7] = (size_t) h->nheads;
+               ((size_t *) hp->base)[2] = (size_t) h->nbucket;
+               ((size_t *) hp->base)[3] = (size_t) h->width;
+               ((size_t *) hp->base)[4] = (size_t) BATcount(b);
+               ((size_t *) hp->base)[5] = (size_t) h->nunique;
+               ((size_t *) hp->base)[6] = (size_t) h->nheads;
                if (!b->theap.dirty &&
                    HEAPsave(&h->heaplink, h->heaplink.filename, NULL, dosync) 
== GDK_SUCCEED &&
                    HEAPsave(hp, hp->filename, NULL, dosync) == GDK_SUCCEED) {
diff --git a/gdk/gdk_hash.h b/gdk/gdk_hash.h
--- a/gdk/gdk_hash.h
+++ b/gdk/gdk_hash.h
@@ -23,8 +23,9 @@
 typedef struct Hash {
        int type;               /* type of index entity */
        uint8_t width;          /* width of hash entries */
-       uint8_t level;          /* mask is (1<<.level)-1 or (2<<.level)-1 */
-       BUN split;              /* ... depending on hash value and .split */
+       BUN mask1;              /* .mask1 < .nbucket <= .mask2 */
+       BUN mask2;
+       BUN nbucket;            /* ... depending on hash value and .nbucket */
        BUN nil;                /* nil representation */
        BUN nunique;            /* number of unique values */
        BUN nheads;             /* number of chain heads */
@@ -33,13 +34,11 @@ typedef struct Hash {
        Heap heaplink;          /* heap where the hash links are stored */
        Heap heapbckt;          /* heap where the hash buckets are stored */
 } Hash;
-#define NHASHBUCKETS(h)                (((BUN) 1 << (h)->level) + (h)->split)
+#define NHASHBUCKETS(h)                ((h)->nbucket)
 static inline BUN
 HASHbucket(const Hash *h, BUN v)
 {
-       return (v & (((BUN) 1 << h->level) - 1)) < h->split
-               ? v & (((BUN) 2 << h->level) - 1)
-               : v & (((BUN) 1 << h->level) - 1);
+       return (v &= h->mask2) < h->nbucket ? v : v & h->mask1;
 }
 
 gdk_export gdk_return BAThash(BAT *b);
@@ -176,8 +175,8 @@ HASHgetlink(Hash *h, BUN i)
 #define hash_loc(H,V)  hash_any(H,V)
 #define hash_var(H,V)  hash_any(H,V)
 #define hash_any(H,V)  HASHbucket(H, ATOMhash((H)->type, (V)))
-#define hash_bte(H,V)  (assert((H)->level >= 8), (BUN) mix_bte(*(const 
unsigned char*) (V)))
-#define hash_sht(H,V)  (assert((H)->level >= 16), (BUN) mix_sht(*(const 
unsigned short*) (V)))
+#define hash_bte(H,V)  (assert((H)->nbucket >= 256), (BUN) mix_bte(*(const 
unsigned char*) (V)))
+#define hash_sht(H,V)  (assert((H)->nbucket >= 65536), (BUN) mix_sht(*(const 
unsigned short*) (V)))
 #define hash_int(H,V)  HASHbucket(H, (BUN) mix_int(*(const unsigned int *) 
(V)))
 /* XXX return size_t-sized value for 8-byte oid? */
 #define hash_lng(H,V)  HASHbucket(H, (BUN) mix_lng(*(const ulng *) (V)))
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to