Changeset: d593e5eda67f for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/d593e5eda67f Modified Files: gdk/gdk_batop.c gdk/gdk_bbp.c gdk/gdk_private.h gdk/gdk_storage.c gdk/gdk_string.c testing/Mtest.py.in Branch: string-dedup Log Message:
Store probe sequence length in string hash buckets. diffs (truncated from 480 to 300 lines): diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -2321,7 +2321,7 @@ BATsort(BAT **sorted, BAT **order, BAT * bn = BATproject(o, b); if (bn == NULL) goto error; - if (bn->ttype == TYPE_void || isVIEW(bn)) { + if (bn->ttype == TYPE_void || bn->theap->parentid != bn->batCacheid) { BAT *b2 = COLcopy(bn, ATOMtype(bn->ttype), true, TRANSIENT); BBPunfix(bn->batCacheid); bn = b2; @@ -2423,7 +2423,7 @@ BATsort(BAT **sorted, BAT **order, BAT * assert(g->ttype == TYPE_oid); grps = (oid *) Tloc(g, 0); prev = grps[0]; - if (BATmaterialize(bn) != GDK_SUCCEED) + if (bn->ttype == TYPE_void && BATmaterialize(bn) != GDK_SUCCEED) goto error; for (r = 0, p = 1, q = BATcount(g); p < q; p++) { if (grps[p] != prev) { diff --git a/gdk/gdk_bbp.c b/gdk/gdk_bbp.c --- a/gdk/gdk_bbp.c +++ b/gdk/gdk_bbp.c @@ -339,6 +339,8 @@ BBPselectfarm(role_t role, int type, enu if (hptype == orderidxheap) role = TRANSIENT; #endif + if (hptype == strhashheap) + role = TRANSIENT; /* for now */ for (i = 0; i < MAXFARMS; i++) if (BBPfarms[i].roles & (1U << (int) role)) return i; @@ -4035,6 +4037,9 @@ BBPdiskscan(const char *parent, size_t b #else delete = true; #endif + } else if (strncmp(p + 1, "tstrhash", 8) == 0) { + /* string hash, we rebuild */ + delete = true; } else if (strncmp(p + 1, "new", 3) != 0) { ok = false; } diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -25,7 +25,8 @@ enum heaptype { varheap, hashheap, imprintsheap, - orderidxheap + orderidxheap, + strhashheap }; gdk_return ATOMheap(int id, Heap *hp, size_t cap) @@ -255,6 +256,8 @@ void settailname(Heap *restrict tail, co __attribute__((__visibility__("hidden"))); void strCleanHash(Heap *hp, bool rebuild) __attribute__((__visibility__("hidden"))); +void strDestroy(BAT *b) + __attribute__((__visibility__("hidden"))); gdk_return strHeap(Heap *d, size_t cap) __attribute__((__visibility__("hidden"))); var_t strLocate(BAT *b, const char *v) diff --git a/gdk/gdk_storage.c b/gdk/gdk_storage.c --- a/gdk/gdk_storage.c +++ b/gdk/gdk_storage.c @@ -1010,6 +1010,8 @@ BATdelete(BAT *b) } else { HEAPfree(b->tvheap, true); } + if (b->strhash) + strDestroy(b); } b->batCopiedtodisk = false; } diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c --- a/gdk/gdk_string.c +++ b/gdk/gdk_string.c @@ -70,17 +70,20 @@ struct hash { BUN nbucket; BUN nentries; BUN growlim; - var_t *buckets; + uint64_t *buckets; Heap heap; }; -#define EMPTY ((var_t) -1) +#define EMPTY ((uint64_t) -1) #define OFFSET 0 +#define PSLSHIFT 40 +#define PSLMASK ((UINT64_C(1) << PSLSHIFT) - 1) static inline BUN hash_str(struct hash *h, const char *value) { - BUN hsh = strHash(value) & h->mask2; + BUN hsh = strHash(value); + hsh &= h->mask2; if (hsh >= h->nbucket) hsh &= h->mask1; return hsh; @@ -97,38 +100,37 @@ enter(BAT *b, const char *value) newsize += 64 * 1024; } while (pos + len > newsize); if (HEAPgrow(&b->theaplock, &b->tvheap, newsize, true) != GDK_SUCCEED) - return EMPTY; + return (var_t) -1; } memcpy(b->tvheap->base + pos, value, len); b->tvheap->free += len; + b->tvheap->dirty = true; return pos; } static inline void -reinsert(BAT *b, struct hash *h, BUN hsh, var_t pos) +reinsert(struct hash *h, BUN hsh, var_t pos) { - BUN psl = 0; - var_t swp = EMPTY; + uint64_t psl = 0; + uint64_t swp = EMPTY; for (;;) { - var_t bkt = h->buckets[hsh]; + uint64_t bkt = h->buckets[hsh]; if (bkt == EMPTY) { /* found an empty slot */ if (swp == EMPTY) { - swp = pos; + swp = (uint64_t) pos; } - h->buckets[hsh] = swp; + h->buckets[hsh] = swp | psl << PSLSHIFT; return; } - const char *v = b->tvheap->base + bkt; - BUN hsh2 = hash_str(h, v); - BUN psl2 = hsh < hsh2 ? hsh + h->nbucket - hsh2 : hsh - hsh2; + uint64_t psl2 = bkt >> PSLSHIFT; if (psl > psl2) { if (swp == EMPTY) { - swp = pos; + swp = (uint64_t) pos; } - h->buckets[hsh] = swp; - swp = bkt; + h->buckets[hsh] = swp | psl << PSLSHIFT; + swp = bkt & PSLMASK; psl = psl2; } if (++hsh == h->nbucket) @@ -138,60 +140,89 @@ reinsert(BAT *b, struct hash *h, BUN hsh } static inline void -rmstring(BAT *b, struct hash *h, BUN pos) +rmstring(struct hash *h, BUN pos) { h->buckets[pos] = EMPTY; for (;;) { BUN hsh = pos + 1; if (hsh >= h->nbucket) hsh = 0; - var_t p = h->buckets[hsh]; - if (p == EMPTY || hash_str(h, b->tvheap->base + p) == hsh) + uint64_t p = h->buckets[hsh]; + if (p == EMPTY) break; - h->buckets[pos] = p; + uint64_t psl = p >> PSLSHIFT; + if (psl == 0) + break; + h->buckets[pos] = (p & PSLMASK) | (psl - 1) << PSLSHIFT; h->buckets[hsh] = EMPTY; pos = hsh; } } +static inline void +incpsl(struct hash *h) +{ + uint64_t hsh = 0; + for (;;) { + uint64_t bkt = h->buckets[hsh]; + if (bkt == EMPTY) + break; + uint64_t psl = bkt >> PSLSHIFT; + if (psl <= hsh) + break; + h->buckets[hsh] = (bkt & PSLMASK) | (psl + 1) << PSLSHIFT; + if (++hsh == h->nbucket) + break; + } +} + static inline gdk_return grow1(BAT *b, struct hash *h) { - struct hash oh = *h; BUN oldmask1 = h->mask1; BUN oldnbucket = h->nbucket; - if ((h->nbucket + OFFSET) * sizeof(var_t) >= h->heap.size) { + if ((h->nbucket + OFFSET) * sizeof(uint64_t) >= h->heap.size) { if (HEAPextend(&h->heap, h->heap.size + 64 * 1024, true) != GDK_SUCCEED) return GDK_FAIL; - h->buckets = (var_t *) h->heap.base + OFFSET; + h->buckets = (uint64_t *) h->heap.base + OFFSET; } - /* before increment so that we use old hash */ - rmstring(b, h, h->nbucket); if (h->nbucket == h->mask2) { h->mask1 = h->mask2; h->mask2 |= h->mask2 << 1; /* extend with one bit */ } h->nbucket++; - h->heap.free = h->nbucket * sizeof(var_t); - h->growlim = (h->nbucket * 3) / 4; + h->heap.free = (h->nbucket + OFFSET) * sizeof(uint64_t); + h->growlim = (BUN) (h->nbucket * (0.75 - (h->nbucket & h->mask1) / (2.0 * h->mask2))); assert(h->mask1 < h->nbucket && h->nbucket <= h->mask2); + incpsl(h); + rmstring(h, h->nbucket - 1); + BUN hsh = oldnbucket & oldmask1; + uint64_t psl = 0; for (;;) { - BUN bkt = h->buckets[hsh]; + uint64_t bkt = h->buckets[hsh]; if (bkt == EMPTY) break; - const char *v = b->tvheap->base + bkt; - BUN c = strHash(v); - if ((c & h->mask2) == oldnbucket) { - /* this one should be moved */ - rmstring(b, &oh, hsh); - reinsert(b, h, oldnbucket, bkt); + uint64_t psl2 = bkt >> PSLSHIFT; + if (psl2 < psl) + break; + if (psl2 == psl) { + bkt &= PSLMASK; + const char *v = b->tvheap->base + bkt; + BUN c = strHash(v); + if ((c & h->mask2) == oldnbucket) { + /* this one should be moved */ + rmstring(h, hsh); + reinsert(h, oldnbucket, (var_t) bkt); + continue; + } } if (++hsh == oldnbucket) hsh = 0; + psl++; } return GDK_SUCCEED; } @@ -200,56 +231,51 @@ static var_t insert(BAT *b, struct hash *h, const char *value) { BUN hsh = hash_str(h, value); - BUN psl = 0; /* probe sequence length */ - var_t swp = EMPTY; /* swapped entry */ - var_t new = EMPTY; /* new entry */ + uint64_t psl = 0; /* probe sequence length */ + uint64_t swp = EMPTY; /* swapped entry */ + var_t new = (var_t) -1; /* new entry */ for (;;) { - var_t bkt = h->buckets[hsh]; + uint64_t bkt = h->buckets[hsh]; if (bkt == EMPTY) { /* found an empty slot */ if (swp == EMPTY) { - assert(new == EMPTY); + assert(new == (var_t) -1); /* didn't enter value yet, so do it now */ - new = swp = enter(b, value); - if (new == EMPTY) + new = enter(b, value); + if (new == (var_t) -1) return (var_t) -1; + swp = (uint64_t) new; h->nentries++; - /* XXX maybe grow hash table (but first - * write h->buckets[hsh]) */ - h->buckets[hsh] = swp; - while (h->nentries == h->growlim) - if (grow1(b, h) != GDK_SUCCEED) - return (var_t) -1; - } else { - h->buckets[hsh] = swp; } + assert(new != (var_t) -1); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list