Changeset: 87653f83cbd5 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/87653f83cbd5 Modified Files: gdk/gdk.h gdk/gdk_atoms.h gdk/gdk_bat.c gdk/gdk_batop.c gdk/gdk_bbp.c gdk/gdk_group.c gdk/gdk_heap.c gdk/gdk_private.h gdk/gdk_project.c gdk/gdk_select.c gdk/gdk_string.c monetdb5/extras/rapi/rapi.c sql/backends/monet5/UDF/pyapi3/conversion3.c sql/storage/bat/bat_storage.c Branch: string-dedup Log Message:
Implemented full string deduplication using Robin Hood hashing + linear hashing. diffs (truncated from 1333 to 300 lines): diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -775,6 +775,7 @@ typedef struct BAT { MT_Lock theaplock; /* lock protecting heap reference changes */ MT_RWLock thashlock; /* lock specifically for hash management */ MT_Lock batIdxLock; /* lock to manipulate other indexes/properties */ + struct hash *strhash; /* hash structure for strings */ } BAT; /* macros to hide complexity of the BAT structure */ @@ -1676,7 +1677,7 @@ tfastins_nocheckVAR(BAT *b, BUN p, const if ((rc = ATOMputVAR(b, &d, v)) != GDK_SUCCEED) return rc; if (b->twidth < SIZEOF_VAR_T && - (b->twidth <= 2 ? d - GDK_VAROFFSET : d) >= ((size_t) 1 << (8 << b->tshift))) { + d >= ((size_t) 1 << (8 << b->tshift))) { /* doesn't fit in current heap, upgrade it */ rc = GDKupgradevarheap(b, d, 0, MAX(p, b->batCount)); if (rc != GDK_SUCCEED) @@ -1684,10 +1685,10 @@ tfastins_nocheckVAR(BAT *b, BUN p, const } switch (b->twidth) { case 1: - ((uint8_t *) b->theap->base)[p] = (uint8_t) (d - GDK_VAROFFSET); + ((uint8_t *) b->theap->base)[p] = (uint8_t) d; break; case 2: - ((uint16_t *) b->theap->base)[p] = (uint16_t) (d - GDK_VAROFFSET); + ((uint16_t *) b->theap->base)[p] = (uint16_t) d; break; case 4: ((uint32_t *) b->theap->base)[p] = (uint32_t) d; diff --git a/gdk/gdk_atoms.h b/gdk/gdk_atoms.h --- a/gdk/gdk_atoms.h +++ b/gdk/gdk_atoms.h @@ -379,11 +379,6 @@ ATOMreplaceVAR(BAT *b, var_t *dst, const #define GDK_STRHASHTABLE (1<<10) /* 1024 */ #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1) #define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t)) -#define GDK_ELIMPOWER 16 /* 64KiB is the threshold */ -#define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT) -#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently: ELIMBASE == 0 */ -#define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) << GDK_ELIMPOWER) -#define GDK_VAROFFSET ((var_t) GDK_STRHASHSIZE) /* * @- String Comparison, NILs and UTF-8 @@ -432,9 +427,9 @@ VarHeapVal(const void *b, BUN p, int w) { switch (w) { case 1: - return (size_t) ((const uint8_t *) b)[p] + GDK_VAROFFSET; + return (size_t) ((const uint8_t *) b)[p]; case 2: - return (size_t) ((const uint16_t *) b)[p] + GDK_VAROFFSET; + return (size_t) ((const uint16_t *) b)[p]; #if SIZEOF_VAR_T == 8 case 4: return (size_t) ((const uint32_t *) b)[p]; diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -1434,10 +1434,10 @@ BUNinplacemulti(BAT *b, const oid *posit _ptr = BUNtloc(bi, p); switch (b->twidth) { default: /* only three or four cases possible */ - _d = (var_t) * (uint8_t *) _ptr + GDK_VAROFFSET; + _d = (var_t) * (uint8_t *) _ptr; break; case 2: - _d = (var_t) * (uint16_t *) _ptr + GDK_VAROFFSET; + _d = (var_t) * (uint16_t *) _ptr; break; case 4: _d = (var_t) * (uint32_t *) _ptr; @@ -1453,7 +1453,7 @@ BUNinplacemulti(BAT *b, const oid *posit return GDK_FAIL; } if (b->twidth < SIZEOF_VAR_T && - (b->twidth <= 2 ? _d - GDK_VAROFFSET : _d) >= ((size_t) 1 << (8 << b->tshift))) { + _d >= ((size_t) 1 << (8 << b->tshift))) { /* doesn't fit in current heap, upgrade it */ if (GDKupgradevarheap(b, _d, 0, bi.count) != GDK_SUCCEED) { MT_rwlock_wrunlock(&b->thashlock); @@ -1465,10 +1465,10 @@ BUNinplacemulti(BAT *b, const oid *posit _ptr = BUNtloc(bi, p); switch (b->twidth) { default: /* only three or four cases possible */ - * (uint8_t *) _ptr = (uint8_t) (_d - GDK_VAROFFSET); + * (uint8_t *) _ptr = (uint8_t) _d; break; case 2: - * (uint16_t *) _ptr = (uint16_t) (_d - GDK_VAROFFSET); + * (uint16_t *) _ptr = (uint16_t) _d; break; case 4: * (uint32_t *) _ptr = (uint32_t) _d; @@ -2138,7 +2138,7 @@ HEAPcommitpersistence(Heap *hp, bool wri } -#define ATOMappendpriv(t, h) (ATOMstorage(t) != TYPE_str || GDK_ELIMDOUBLES(h)) +#define ATOMappendpriv(t, h) (ATOMstorage(t) != TYPE_str) /* change the heap modes at a commit */ gdk_return diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -68,7 +68,6 @@ insert_string_bat(BAT *b, BAT *n, struct unsigned int tiv; /* tail value-as-int */ #endif var_t v; /* value */ - size_t off; /* offset within n's string heap */ BUN cnt = ci->ncand; BUN oldcnt = BATcount(b); @@ -109,8 +108,7 @@ insert_string_bat(BAT *b, BAT *n, struct bat_iterator_end(&ni); return GDK_FAIL; } - if (oldcnt == 0 || (!GDK_ELIMDOUBLES(b->tvheap) && - !GDK_ELIMDOUBLES(ni.vh))) { + if (oldcnt == 0) { /* we'll consider copying the string heap completely * * we first estimate how much space the string heap @@ -125,13 +123,7 @@ insert_string_bat(BAT *b, BAT *n, struct len += strlen(BUNtvar(ni, p)) + 1; } len = (len + 512) / 1024; /* rounded average length */ - r = (GDK_ELIMLIMIT - GDK_STRHASHSIZE) / (len + 12); - /* r is estimate of number of strings in - * double-eliminated area */ - if (r < ci->ncand) - len = GDK_ELIMLIMIT + (ci->ncand - r) * len; - else - len = GDK_STRHASHSIZE + ci->ncand * (len + 12); + len = ci->ncand * len; /* len is total estimated expected size of vheap */ if (len > ni.vh->free / 2) { @@ -156,7 +148,7 @@ insert_string_bat(BAT *b, BAT *n, struct /* if toff has the initial value of ~0, we insert strings * individually, otherwise we only copy (insert) offsets */ if (toff == ~(size_t) 0) - v = GDK_VAROFFSET; + v = 1; else v = b->tvheap->free - 1; @@ -172,12 +164,11 @@ insert_string_bat(BAT *b, BAT *n, struct * values, so we can use fast memcpy */ MT_thread_setalgorithm("memcpy offsets"); memcpy(Tloc(b, BUNlast(b)), (const char *) ni.base + ((ci->seq - n->hseqbase) << ni.shift), cnt << ni.shift); - } else if (toff != ~(size_t) 0) { + } else if (toff == 0) { /* we don't need to insert any actual strings since we * have already made sure that they are all in b's - * string heap at known locations (namely the offset - * in n added to toff), so insert offsets from n after - * adding toff into b */ + * string heap at known locations, so insert offsets + * from n into b */ /* note the use of the "restrict" qualifier here: all * four pointers below point to the same value, but * only one of them will actually be used, hence we @@ -219,10 +210,10 @@ insert_string_bat(BAT *b, BAT *n, struct p = canditer_next(ci) - n->hseqbase; switch (ni.width) { case 1: - v = (var_t) tbp[p] + GDK_VAROFFSET; + v = (var_t) tbp[p]; break; case 2: - v = (var_t) tsp[p] + GDK_VAROFFSET; + v = (var_t) tsp[p]; break; #if SIZEOF_VAR_T == 8 case 4: @@ -233,17 +224,15 @@ insert_string_bat(BAT *b, BAT *n, struct v = tvp[p]; break; } - v = (var_t) ((size_t) v + toff); - assert(v >= GDK_VAROFFSET); assert((size_t) v < b->tvheap->free); switch (b->twidth) { case 1: - assert(v - GDK_VAROFFSET < ((var_t) 1 << 8)); - ((uint8_t *) b->theap->base)[r++] = (uint8_t) (v - GDK_VAROFFSET); + assert(v < ((var_t) 1 << 8)); + ((uint8_t *) b->theap->base)[r++] = (uint8_t) v; break; case 2: - assert(v - GDK_VAROFFSET < ((var_t) 1 << 16)); - ((uint16_t *) b->theap->base)[r++] = (uint16_t) (v - GDK_VAROFFSET); + assert(v < ((var_t) 1 << 16)); + ((uint16_t *) b->theap->base)[r++] = (uint16_t) v; break; #if SIZEOF_VAR_T == 8 case 4: @@ -256,13 +245,9 @@ insert_string_bat(BAT *b, BAT *n, struct break; } } - } else if (b->tvheap->free < ni.vh->free / 2 || - GDK_ELIMDOUBLES(b->tvheap)) { - /* if b's string heap is much smaller than n's string - * heap, don't bother checking whether n's string - * values occur in b's string heap; also, if b is - * (still) fully double eliminated, we must continue - * to use the double elimination mechanism */ + } else { + /* we must continue to use the double elimination + * mechanism by inserting individual strings */ r = b->batCount; oid hseq = n->hseqbase; MT_thread_setalgorithm("insert string values"); @@ -276,55 +261,6 @@ insert_string_bat(BAT *b, BAT *n, struct } r++; } - } else { - /* Insert values from n individually into b; however, - * we check whether there is a string in b's string - * heap at the same offset as the string is in n's - * string heap (in case b's string heap is a copy of - * n's). If this is the case, we just copy the - * offset, otherwise we insert normally. */ - r = b->batCount; - MT_thread_setalgorithm("insert string values with check"); - while (cnt > 0) { - cnt--; - p = canditer_next(ci) - n->hseqbase; - off = BUNtvaroff(ni, p); /* the offset */ - tp = ni.vh->base + off; /* the string */ - if (off < b->tvheap->free && - strcmp(b->tvheap->base + off, tp) == 0) { - /* we found the string at the same - * offset in b's string heap as it was - * in n's string heap, so we don't - * have to insert a new string into b: - * we can just copy the offset */ - v = (var_t) off; - switch (b->twidth) { - case 1: - assert(v - GDK_VAROFFSET < ((var_t) 1 << 8)); - ((uint8_t *) b->theap->base)[r] = (unsigned char) (v - GDK_VAROFFSET); - break; - case 2: - assert(v - GDK_VAROFFSET < ((var_t) 1 << 16)); - ((uint16_t *) b->theap->base)[r] = (unsigned short) (v - GDK_VAROFFSET); - break; -#if SIZEOF_VAR_T == 8 - case 4: - assert(v < ((var_t) 1 << 32)); - ((uint32_t *) b->theap->base)[r] = (unsigned int) v; - break; -#endif - default: - ((var_t *) b->theap->base)[r] = v; - break; - } - } else { - if (tfastins_nocheckVAR(b, r, tp) != GDK_SUCCEED) { - bat_iterator_end(&ni); - return GDK_FAIL; - } - } - r++; - } } BATsetcount(b, oldcnt + ci->ncand); bat_iterator_end(&ni); @@ -1241,10 +1177,10 @@ BATappend_or_update(BAT *b, BAT *p, cons var_t d; switch (b->twidth) { default: /* only three or four cases possible */ - d = (var_t) ((uint8_t *) b->theap->base)[updid] + GDK_VAROFFSET; + d = (var_t) ((uint8_t *) b->theap->base)[updid]; break; case 2: - d = (var_t) ((uint16_t *) b->theap->base)[updid] + GDK_VAROFFSET; + d = (var_t) ((uint16_t *) b->theap->base)[updid]; break; case 4: d = (var_t) ((uint32_t *) b->theap->base)[updid]; @@ -1259,7 +1195,7 @@ BATappend_or_update(BAT *b, BAT *p, cons goto bailout; } if (b->twidth < SIZEOF_VAR_T && - (b->twidth <= 2 ? d - GDK_VAROFFSET : d) >= ((size_t) 1 << (8 << b->tshift))) { + d >= ((size_t) 1 << (8 << b->tshift))) { /* doesn't fit in current heap, upgrade it */ if (GDKupgradevarheap(b, d, 0, MAX(updid, b->batCount)) != GDK_SUCCEED) { goto bailout; @@ -1271,10 +1207,10 @@ BATappend_or_update(BAT *b, BAT *p, cons bi = bat_iterator_nolock(b); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list