Changeset: eabe0b36be21 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/eabe0b36be21
Modified Files:
	gdk/gdk.h
	gdk/gdk_bat.c
	gdk/gdk_batop.c
	gdk/gdk_hash.c
	gdk/gdk_private.h
	gdk/gdk_select.c
	gdk/gdk_unique.c
	sql/server/rel_schema.c
	sql/storage/bat/bat_logger.c
	sql/test/emptydb-upgrade-chain-hge/Tests/upgrade.stable.out.int128
	sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out
	sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out.int128
	sql/test/emptydb/Tests/check.stable.out
	sql/test/emptydb/Tests/check.stable.out.32bit
	sql/test/emptydb/Tests/check.stable.out.int128
	sql/test/testdb-upgrade-chain-hge/Tests/upgrade.stable.out.int128
	sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out
	sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out.int128
	sql/test/testdb-upgrade-hge/Tests/upgrade.stable.out.int128
	sql/test/testdb-upgrade/Tests/upgrade.stable.out
	sql/test/testdb-upgrade/Tests/upgrade.stable.out.32bit
	sql/test/testdb-upgrade/Tests/upgrade.stable.out.int128
Branch: default
Log Message:
Merge with Oct2020 branch. diffs (truncated from 57385 to 300 lines): diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2243,13 +2243,14 @@ gdk_export void VIEWbounds(BAT *b, BAT * */ enum prop_t { GDK_MIN_VALUE = 3, /* smallest non-nil value in BAT */ - GDK_MIN_POS, /* BUN position of smallest value */ + GDK_MIN_POS, /* BUN position of smallest value (oid) */ GDK_MAX_VALUE, /* largest non-nil value in BAT */ - GDK_MAX_POS, /* BUN position of largest value */ - GDK_HASH_BUCKETS, /* last used hash bucket size */ - GDK_NUNIQUE, /* number of unique values */ - GDK_UNIQUE_ESTIMATE, /* estimate of number of distinct values */ + GDK_MAX_POS, /* BUN position of largest value (oid) */ + GDK_HASH_BUCKETS, /* last used hash bucket size (oid) */ + GDK_NUNIQUE, /* number of unique values (oid) */ + GDK_UNIQUE_ESTIMATE, /* estimate of number of distinct values (dbl) */ }; + gdk_export ValPtr BATgetprop(BAT *b, enum prop_t idx); /* diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c --- a/gdk/gdk_align.c +++ b/gdk/gdk_align.c @@ -255,6 +255,7 @@ BATmaterialize(BAT *b) b->tbaseoff = 0; b->theap->dirty = true; BATsetprop_nolock(b, GDK_NUNIQUE, TYPE_oid, &(oid){is_oid_nil(t) ? 1 : b->batCount}); + BATsetprop_nolock(b, GDK_UNIQUE_ESTIMATE, TYPE_dbl, &(dbl){is_oid_nil(t) ? 
1 : b->batCount}); MT_lock_unset(&b->theaplock); b->ttype = TYPE_oid; BATsetdims(b); diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -674,6 +674,7 @@ BATfree(BAT *b) MT_lock_set(&b->theaplock); if (nunique != BUN_NONE) { BATsetprop_nolock(b, GDK_NUNIQUE, TYPE_oid, &(oid){nunique}); + BATsetprop_nolock(b, GDK_UNIQUE_ESTIMATE, TYPE_dbl, &(dbl){nunique}); BATsetprop_nolock(b, GDK_HASH_BUCKETS, TYPE_oid, &(oid){nbucket}); } if (b->theap) { @@ -1041,7 +1042,8 @@ BUNappendmulti(BAT *b, const void *value return rc; } - BATrmprop(b, GDK_UNIQUE_ESTIMATE); + if (count > BATcount(b) / GDK_UNIQUE_ESTIMATE_KEEP_FRACTION) + BATrmprop(b, GDK_UNIQUE_ESTIMATE); b->theap->dirty = true; const void *t = b->ttype == TYPE_msk ? &(msk){false} : ATOMnilptr(b->ttype); if (b->ttype == TYPE_oid) { @@ -1277,7 +1279,8 @@ BUNdelete(BAT *b, oid o) b->tnorevsorted = 0; MT_lock_set(&b->theaplock); b->batCount--; - BATrmprop_nolock(b, GDK_UNIQUE_ESTIMATE); + if (BATcount(b) < GDK_UNIQUE_ESTIMATE_KEEP_FRACTION) + BATrmprop_nolock(b, GDK_UNIQUE_ESTIMATE); MT_lock_unset(&b->theaplock); if (b->batCount <= 1) { /* some trivial properties */ @@ -1405,7 +1408,8 @@ BUNinplacemulti(BAT *b, const oid *posit } else { BATrmprop_nolock(b, GDK_MIN_POS); } - BATrmprop_nolock(b, GDK_UNIQUE_ESTIMATE); + if (count > BATcount(b) / GDK_UNIQUE_ESTIMATE_KEEP_FRACTION) + BATrmprop_nolock(b, GDK_UNIQUE_ESTIMATE); MT_lock_unset(&b->theaplock); } else { PROPdestroy(b); diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -787,7 +787,8 @@ BATappend2(BAT *b, BAT *n, BAT *s, bool BATrmprop(b, GDK_MIN_POS); } } - BATrmprop(b, GDK_UNIQUE_ESTIMATE); + if (cnt > BATcount(b) / GDK_UNIQUE_ESTIMATE_KEEP_FRACTION) + BATrmprop(b, GDK_UNIQUE_ESTIMATE); /* load hash so that we can maintain it */ (void) BATcheckhash(b); @@ -1163,7 +1164,8 @@ BATappend_or_update(BAT *b, BAT *p, cons OIDXdestroy(b); IMPSdestroy(b); - BATrmprop(b, GDK_UNIQUE_ESTIMATE); + if 
(ni.count > BATcount(b) / GDK_UNIQUE_ESTIMATE_KEEP_FRACTION) + BATrmprop(b, GDK_UNIQUE_ESTIMATE); /* load hash so that we can maintain it */ (void) BATcheckhash(b); diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c --- a/gdk/gdk_hash.c +++ b/gdk/gdk_hash.c @@ -818,6 +818,9 @@ BAThash_impl(BAT *restrict b, struct can maxmask = HASHmask(ci->ncand); if (mask > maxmask) mask = maxmask; + } else if (!hascand && (prop = BATgetprop_try(b, GDK_UNIQUE_ESTIMATE)) != NULL) { + assert(prop->vtype == TYPE_dbl); + mask = (BUN) (prop->val.dval * 8 / 7); } else { /* dynamic hash: we start with HASHmask(ci->ncand)/64, or, * if ci->ncand large enough, HASHmask(ci->ncand)/256; if there @@ -981,6 +984,9 @@ BAThash_impl(BAT *restrict b, struct can } bat_iterator_end(&bi); if (!hascand) { + /* don't keep these properties while we have a hash + * structure: they get added again when the hash is + * freed */ MT_lock_set(&b->theaplock); BATrmprop_nolock(b, GDK_HASH_BUCKETS); BATrmprop_nolock(b, GDK_NUNIQUE); @@ -1120,9 +1126,14 @@ HASHappend_locked(BAT *b, BUN i, const v return; } assert(i * h->width == h->heaplink.free); - if (HASHfix(h, false, true) != GDK_SUCCEED) { + if (h->nunique < b->batCount / HASH_DESTROY_UNIQUES_FRACTION) { + b->thash = NULL; doHASHdestroy(b, h); + return; + } + if (HASHfix(h, false, true) != GDK_SUCCEED) { b->thash = NULL; + doHASHdestroy(b, h); return; } if (HASHwidth(i + 1) > h->width && @@ -1184,9 +1195,14 @@ HASHinsert_locked(BAT *b, BUN p, const v return; } assert(p * h->width < h->heaplink.free); - if (HASHfix(h, false, true) != GDK_SUCCEED) { + if (h->nunique < b->batCount / HASH_DESTROY_UNIQUES_FRACTION) { + b->thash = NULL; doHASHdestroy(b, h); + return; + } + if (HASHfix(h, false, true) != GDK_SUCCEED) { b->thash = NULL; + doHASHdestroy(b, h); return; } BUN c = HASHprobe(h, v); @@ -1260,9 +1276,14 @@ HASHdelete_locked(BAT *b, BUN p, const v return; } assert(p * h->width < h->heaplink.free); - if (HASHfix(h, false, true) != GDK_SUCCEED) { + if (h->nunique < 
b->batCount / HASH_DESTROY_UNIQUES_FRACTION) { + b->thash = NULL; doHASHdestroy(b, h); + return; + } + if (HASHfix(h, false, true) != GDK_SUCCEED) { b->thash = NULL; + doHASHdestroy(b, h); return; } BUN c = HASHprobe(h, v); diff --git a/gdk/gdk_logger.c b/gdk/gdk_logger.c --- a/gdk/gdk_logger.c +++ b/gdk/gdk_logger.c @@ -342,11 +342,13 @@ string_reader(logger *lg, BAT *b, lng nr sz = (size_t)SZ; char *buf = lg->buf; if (lg->bufsize < sz) { - lg->buf = buf = GDKrealloc(buf, sz); + if (!(buf = GDKrealloc(lg->buf, sz))) + return LOG_ERR; + lg->buf = buf; lg->bufsize = sz; } - if (!buf || mnstr_read(lg->input_log, buf, sz, 1) != 1) + if (mnstr_read(lg->input_log, buf, sz, 1) != 1) return LOG_EOF; /* handle strings */ char *t = buf; @@ -2363,7 +2365,7 @@ string_writer(logger *lg, BAT *b, lng of size_t bufsz = lg->bufsize, resize = 0; BUN end = (BUN)(offset + nr); char *buf = lg->buf; - gdk_return res = GDK_FAIL; + gdk_return res = GDK_SUCCEED; if (!buf) return GDK_FAIL; @@ -2372,11 +2374,11 @@ string_writer(logger *lg, BAT *b, lng of for ( ; p < end; ) { size_t sz = 0; if (resize) { - lg->buf = buf = GDKrealloc(buf, resize); - if (!buf) { + if (!(buf = GDKrealloc(lg->buf, resize))) { res = GDK_FAIL; break; } + lg->buf = buf; lg->bufsize = bufsz = resize; resize = 0; } @@ -2394,8 +2396,10 @@ string_writer(logger *lg, BAT *b, lng of sz += len; } } - if (sz && buf && mnstr_writeLng(lg->output_log, (lng) sz) && mnstr_write(lg->output_log, buf, sz, 1) == 1) - res = GDK_SUCCEED; + if (sz && (!mnstr_writeLng(lg->output_log, (lng) sz) || mnstr_write(lg->output_log, buf, sz, 1) != 1)) { + res = GDK_FAIL; + break; + } } bat_iterator_end(&bi); return res; diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -463,6 +463,16 @@ extern MT_Lock GDKtmLock; #define GDKcacheLock(y) GDKbbpLock[y].cache #define BBP_free(y) GDKbbpLock[y].free +/* when the number of updates to a BAT is less than 1 in this number, we + * keep the 
GDK_UNIQUE_ESTIMATE property */ +extern BUN GDK_UNIQUE_ESTIMATE_KEEP_FRACTION; /* should become a define once */ +/* if the number of unique values is less than 1 in this number, we + * destroy the hash rather than update it in HASH{append,insert,delete} */ +extern BUN HASH_DESTROY_UNIQUES_FRACTION; /* likewise */ +/* if the estimated number of unique values is less than 1 in this + * number, don't build a hash table to do a hashselect */ +extern dbl NO_HASH_SELECT_FRACTION; /* same here */ + #if !defined(NDEBUG) && !defined(__COVERITY__) /* see comment in gdk.h */ #ifdef __GNUC__ diff --git a/gdk/gdk_select.c b/gdk/gdk_select.c --- a/gdk/gdk_select.c +++ b/gdk/gdk_select.c @@ -1576,6 +1576,14 @@ BATselect(BAT *b, BAT *s, const void *tl (!b->batTransient && ATOMsize(b->ttype) >= sizeof(BUN) / 4 && BATcount(b) * (ATOMsize(b->ttype) + 2 * sizeof(BUN)) < GDK_mem_maxsize / 2); + if (wanthash && !havehash) { + const ValRecord *prop; + if ((prop = BATgetprop(b, GDK_UNIQUE_ESTIMATE)) != NULL && + prop->val.dval < BATcount(b) / NO_HASH_SELECT_FRACTION) { + /* too many duplicates: not worth it */ + wanthash = false; + } + } } if (equi && !havehash && parent != 0) { diff --git a/gdk/gdk_unique.c b/gdk/gdk_unique.c --- a/gdk/gdk_unique.c +++ b/gdk/gdk_unique.c @@ -92,6 +92,8 @@ BATunique(BAT *b, BAT *s) MT_lock_set(&b->theaplock); if ((prop = BATgetprop_nolock(b, GDK_NUNIQUE)) != NULL) initsize = prop->val.oval; + else if ((prop = BATgetprop_nolock(b, GDK_UNIQUE_ESTIMATE)) != NULL) + initsize = (BUN) prop->val.dval; MT_lock_unset(&b->theaplock); } } diff --git a/gdk/gdk_utils.c b/gdk/gdk_utils.c --- a/gdk/gdk_utils.c +++ b/gdk/gdk_utils.c @@ -57,6 +57,16 @@ static void GDKunlockHome(int farmid); #undef realloc #undef free +/* when the number of updates to a BAT is less than 1 in this number, we + * keep the GDK_UNIQUE_ESTIMATE property */ +BUN GDK_UNIQUE_ESTIMATE_KEEP_FRACTION = 1000; /* should become a define once */ +/* if the number of unique values is less than 1 in this 
number, we + * destroy the hash rather than update it in HASH{append,insert,delete} */ +BUN HASH_DESTROY_UNIQUES_FRACTION = 1000; /* likewise */ +/* if the estimated number of unique values is less than 1 in this + * number, don't build a hash table to do a hashselect */ +dbl NO_HASH_SELECT_FRACTION = 1000; /* same here */ + /* * @+ Monet configuration file * Parse a possible MonetDB config file (if specified by command line @@ -1146,6 +1156,21 @@ GDKinit(opt *set, int setlen, bool embed TRC_CRITICAL(GDK, "GDKsetenv revision failed"); return GDK_FAIL; } + GDK_UNIQUE_ESTIMATE_KEEP_FRACTION = 0; + if ((p = GDKgetenv("gdk_unique_estimate_keep_fraction")) != NULL) _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list