Changeset: f054b7310be7 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f054b7310be7 Added Files: monetdb5/modules/atoms/Tests/strappend.malC monetdb5/modules/atoms/Tests/strappend.stable.err monetdb5/modules/atoms/Tests/strappend.stable.out Modified Files: gdk/ChangeLog.Dec2016 gdk/gdk.h gdk/gdk_batop.c gdk/gdk_bbp.c gdk/gdk_join.c monetdb5/mal/mal_client.c monetdb5/modules/atoms/Tests/All monetdb5/optimizer/opt_pipes.c sql/backends/monet5/sql_statistics.c Branch: default Log Message:
Merge with Dec2016 branch. NOTE: YOUR DATABASES THAT YOU CREATED IN THE DEFAULT BRANCH CANNOT BE USED. In the Dec2016 branch the BBP version number was increased in order to fix a bug where the empty string might occur with different offsets in a duplicate-eliminated string heap. This conflicts with a version number increase that had been done in the default branch. This means that your default databases are now incompatible with the current code. Dump before you build, and restore after (or simply throw them away). diffs (truncated from 627 to 300 lines): diff --git a/gdk/ChangeLog.Dec2016 b/gdk/ChangeLog.Dec2016 --- a/gdk/ChangeLog.Dec2016 +++ b/gdk/ChangeLog.Dec2016 @@ -1,3 +1,12 @@ # ChangeLog file for MonetDB # This file is updated with Maddlog +* Tue Feb 28 2017 Sjoerd Mullender <sjo...@acm.org> +- Fixed a bug when appending string bats that are fully duplicate + eliminated. It could happend that the to-be-appended bat had an empty + string at an offset and at that same offset in the to-be-appended-to bat + there happened to be a (sequence of) NULL(s). Then this offset would be + used, even though it might nog be the right offset for the empty string + in the to-be-appended-to bat. This would result in multiple offsets for + the empty string, breaking the promise of being duplicate eliminated. + diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -854,8 +854,9 @@ typedef struct { #define GDKLIBRARY_INSERTED 061032 /* inserted and deleted in BBP.dir */ #define GDKLIBRARY_HEADED 061033 /* head properties are stored */ #define GDKLIBRARY_NOKEY 061034 /* nokey values can't be trusted */ -#define GDKLIBRARY_TALIGN 061035 /* talign field in BBP.dir */ -#define GDKLIBRARY 061036 +#define GDKLIBRARY_BADEMPTY 061035 /* possibility of duplicate empty str */ +#define GDKLIBRARY_TALIGN 061036 /* talign field in BBP.dir */ +#define GDKLIBRARY 061037 typedef struct BAT { /* static bat properties */ diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -301,10 +301,13 @@ insert_string_bat(BAT *b, BAT *n, BAT *s } bunfastapp(b, tp); } - } else if (b->tvheap->free < n->tvheap->free / 2) { + } else if (b->tvheap->free < n->tvheap->free / 2 || + GDK_ELIMDOUBLES(b->tvheap)) { /* if b's string heap is much smaller than n's string * heap, don't bother checking whether n's string - * values occur in b's string heap */ + * values occur in b's string heap; also, if b is + * (still) fully double eliminated, we must continue + * to use the double elimination mechanism */ r = BUNlast(b); if (cand) { oid hseq = n->hseqbase; diff --git a/gdk/gdk_bbp.c b/gdk/gdk_bbp.c --- a/gdk/gdk_bbp.c +++ b/gdk/gdk_bbp.c @@ -629,6 +629,287 @@ fixwkbheap(void) } #endif +#ifdef GDKLIBRARY_BADEMPTY +/* There was a bug (fixed in changeset 1f5498568a24) which could + * result in empty strings not being double-eliminated. This code + * fixes the affected bats. + * Note that we only fix BATs whose string heap is still fully double + * eliminated. */ +static inline int +offsearch(const int *restrict offsets, int noffsets, int val) +{ + /* binary search on offsets for val, return whether present */ + int lo = 0, hi = noffsets - 1, mid; + + while (hi > lo) { + mid = (lo + hi) / 2; + if (offsets[mid] == val) + return 1; + if (offsets[mid] < val) + lo = mid + 1; + else + hi = mid - 1; + } + return offsets[lo] == val; +} + +static void +fixstroffheap(BAT *b, int *restrict offsets) +{ + long_str filename; + Heap h1; /* old offset heap */ + Heap h2; /* new string heap */ + Heap h3; /* new offset heap */ + Heap *h; /* string heap */ + int noffsets = 0; + const size_t extralen = b->tvheap->hashash ? EXTRALEN : 0; + size_t pos; + var_t emptyoff = 0; + const char *nme, *bnme; + char *srcdir; + BUN i; + int width; + int nofix = 1; + + assert(GDK_ELIMDOUBLES(b->tvheap)); + + nme = BBP_physical(b->batCacheid); + srcdir = GDKfilepath(NOFARM, BATDIR, nme, NULL); + if (srcdir == NULL) + GDKfatal("fixstroffheap: GDKmalloc failed\n"); + *strrchr(srcdir, DIR_SEP) = 0; + + /* load string heap */ + if (HEAPload(b->tvheap, nme, "theap", 0) != GDK_SUCCEED) + GDKfatal("fixstroffheap: loading string (theap) heap " + "for BAT %d failed\n", b->batCacheid); + h = b->tvheap; /* abbreviation */ + /* collect valid offsets */ + pos = GDK_STRHASHSIZE; + while (pos < h->free) { + const char *s; + size_t pad; + + pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1)); + if (pad < sizeof(stridx_t)) + pad += GDK_VARALIGN; + pos += pad + extralen; + s = h->base + pos; + if (*s == '\0') + emptyoff = (var_t) pos; + offsets[noffsets++] = (int) pos; /* < 65536, i.e. fits */ + pos += GDK_STRLEN(s); + } + HEAPfree(b->tvheap, 0); + + if ((bnme = strrchr(nme, DIR_SEP)) != NULL) + bnme++; + else + bnme = nme; + sprintf(filename, "BACKUP%c%s", DIR_SEP, bnme); + + width = b->twidth; + h2.dirty = 0; + if (emptyoff == 0) { + /* no legitimate empty string in the string heap; we + * now make a backup of the old string heap and create + * a new one to which we add an empty string */ + h2 = *b->tvheap; + if (GDKmove(h2.farmid, srcdir, bnme, "theap", BAKDIR, bnme, "theap") != GDK_SUCCEED) + GDKfatal("fixstroffheap: cannot make backup of %s.theap\n", nme); + h2.filename = GDKfilepath(NOFARM, NULL, nme, "theap"); + if (h2.filename == NULL) + GDKfatal("fixstroffheap: GDKmalloc failed\n"); + h2.base = NULL; + if (HEAPalloc(&h2, h2.size, 1) != GDK_SUCCEED) + GDKfatal("fixstroffheap: allocating new string heap " + "for BAT %d failed\n", b->batCacheid); + h2.cleanhash = b->tvheap->cleanhash; + h2.hashash = b->tvheap->hashash; + h2.free = b->tvheap->free; + /* load old offset heap and copy contents to new heap */ + h1 = *b->tvheap; + h1.filename = NULL; + h1.base = NULL; + h1.dirty = 0; + if (HEAPload(&h1, filename, "theap", 0) != GDK_SUCCEED) + GDKfatal("fixstroffheap: loading old tail heap " + "for BAT %d failed\n", b->batCacheid); + memcpy(h2.base, h1.base, h2.free); + HEAPfree(&h1, 0); + h2.dirty = 1; + if ((*BATatoms[TYPE_str].atomPut)(&h2, &emptyoff, "") == 0) + GDKfatal("fixstroffheap: cannot insert empty string " + "in BAT %d failed\n", b->batCacheid); + /* if the offset of the new empty string doesn't fit + * in the offset heap (too many bits for the current + * width), we will also make the new offset heap + * wider */ + if ((width <= 2 ? emptyoff - GDK_VAROFFSET : emptyoff) >= (var_t) (1 << (width * 8))) { + width <<= 1; + assert((width <= 2 ? emptyoff - GDK_VAROFFSET : emptyoff) < (var_t) (1 << (width * 8))); + } + } + + /* make backup of offset heap */ + if (GDKmove(b->theap.farmid, srcdir, bnme, "tail", BAKDIR, bnme, "tail") != GDK_SUCCEED) + GDKfatal("fixstroffheap: cannot make backup of %s.tail\n", nme); + /* load old offset heap */ + h1 = b->theap; + h1.filename = NULL; + h1.base = NULL; + h1.dirty = 0; + if (HEAPload(&h1, filename, "tail", 0) != GDK_SUCCEED) + GDKfatal("fixstroffheap: loading old tail heap " + "for BAT %d failed\n", b->batCacheid); + + /* create new offset heap */ + h3 = b->theap; + h3.filename = GDKfilepath(NOFARM, NULL, nme, "tail"); + if (h3.filename == NULL) + GDKfatal("fixstroffheap: GDKmalloc failed\n"); + if (HEAPalloc(&h3, b->batCapacity, width) != GDK_SUCCEED) + GDKfatal("fixstroffheap: allocating new tail heap " + "for BAT %d failed\n", b->batCacheid); + h3.dirty = TRUE; + h3.free = h1.free; + + switch (b->twidth) { + case 1: + for (i = 0; i < b->batCount; i++) { + pos = ((var_t) ((unsigned char *) h1.base)[i] + GDK_VAROFFSET) << GDK_VARSHIFT; + if (!offsearch(offsets, noffsets, (int) pos)) { + pos = emptyoff; + nofix = 0; + } + if (width == 1) + ((unsigned char *) h3.base)[i] = (unsigned char) ((pos >> GDK_VARSHIFT) - GDK_VAROFFSET); + else + ((unsigned short *) h3.base)[i] = (unsigned short) ((pos >> GDK_VARSHIFT) - GDK_VAROFFSET); + } + break; + case 2: + for (i = 0; i < b->batCount; i++) { + pos = ((var_t) ((unsigned short *) h1.base)[i] + GDK_VAROFFSET) << GDK_VARSHIFT; + if (!offsearch(offsets, noffsets, (int) pos)) { + pos = emptyoff; + nofix = 0; + } + if (width == 2) + ((unsigned short *) h3.base)[i] = (unsigned short) ((pos >> GDK_VARSHIFT) - GDK_VAROFFSET); + else + ((unsigned int *) h3.base)[i] = (unsigned int) ((pos >> GDK_VARSHIFT) - GDK_VAROFFSET); + } + break; + case 4: + for (i = 0; i < b->batCount; i++) { + pos = (var_t) ((unsigned int *) h1.base)[i] << GDK_VARSHIFT; + if (!offsearch(offsets, noffsets, (int) pos)) { + pos = emptyoff; + nofix = 0; + } + ((unsigned int *) h3.base)[i] = (unsigned int) (pos >> GDK_VARSHIFT); + } + break; +#if SIZEOF_VAR_T == 8 + case 8: + for (i = 0; i < b->batCount; i++) { + pos = (var_t) ((ulng *) h1.base)[i] << GDK_VARSHIFT; + if (!offsearch(offsets, noffsets, (int) pos)) { + pos = emptyoff; + nofix = 0; + } + ((ulng *) h3.base)[i] = (ulng) (pos >> GDK_VARSHIFT); + } + break; +#endif + default: + /* cannot happen */ + assert(0); + } + + /* cleanup */ + HEAPfree(&h1, 0); + if (nofix) { + /* didn't fix anything, move backups back */ + if (h2.dirty) { + HEAPfree(&h2, 1); + if (GDKmove(b->tvheap->farmid, BAKDIR, bnme, "theap", srcdir, bnme, "theap") != GDK_SUCCEED) + GDKfatal("fixstroffheap: cannot restore backup of %s.theap\n", nme); + } + HEAPfree(&h3, 1); + if (GDKmove(b->theap.farmid, BAKDIR, bnme, "tail", srcdir, bnme, "tail") != GDK_SUCCEED) + GDKfatal("fixstroffheap: cannot restore backup of %s.tail\n", nme); + } else { + /* offset heap was fixed */ + b->twidth = width; + b->batDirtydesc = 1; + if (h2.dirty) { + /* in addition, we added an empty string to + * the string heap */ + HEAPsave(&h2, nme, "theap"); + HEAPfree(&h2, 0); + *b->tvheap = h2; + } + HEAPsave(&h3, nme, "tail"); + HEAPfree(&h3, 0); + b->theap = h3; + } + GDKfree(srcdir); +} + +static void +fixstrbats(void) +{ + bat bid; + BAT *b; + int *offsets; + int tt; + + fprintf(stderr, + "# fixing string offset heaps\n"); + fflush(stderr); + + /* The minimum size a string occupies in the double-eliminated + * part of a string heap is SIZEOF_VAR_T (due to padding to _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list