Changeset: eabe0b36be21 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/eabe0b36be21
Modified Files:
        gdk/gdk.h
        gdk/gdk_bat.c
        gdk/gdk_batop.c
        gdk/gdk_hash.c
        gdk/gdk_private.h
        gdk/gdk_select.c
        gdk/gdk_unique.c
        sql/server/rel_schema.c
        sql/storage/bat/bat_logger.c
        sql/test/emptydb-upgrade-chain-hge/Tests/upgrade.stable.out.int128
        sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out
        sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out.int128
        sql/test/emptydb/Tests/check.stable.out
        sql/test/emptydb/Tests/check.stable.out.32bit
        sql/test/emptydb/Tests/check.stable.out.int128
        sql/test/testdb-upgrade-chain-hge/Tests/upgrade.stable.out.int128
        sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out
        sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out.int128
        sql/test/testdb-upgrade-hge/Tests/upgrade.stable.out.int128
        sql/test/testdb-upgrade/Tests/upgrade.stable.out
        sql/test/testdb-upgrade/Tests/upgrade.stable.out.32bit
        sql/test/testdb-upgrade/Tests/upgrade.stable.out.int128
Branch: default
Log Message:

Merge with Oct2020 branch.


diffs (truncated from 57385 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2243,13 +2243,14 @@ gdk_export void VIEWbounds(BAT *b, BAT *
  */
 enum prop_t {
        GDK_MIN_VALUE = 3,      /* smallest non-nil value in BAT */
-       GDK_MIN_POS,            /* BUN position of smallest value  */
+       GDK_MIN_POS,            /* BUN position of smallest value (oid) */
        GDK_MAX_VALUE,          /* largest non-nil value in BAT */
-       GDK_MAX_POS,            /* BUN position of largest value  */
-       GDK_HASH_BUCKETS,       /* last used hash bucket size */
-       GDK_NUNIQUE,            /* number of unique values */
-       GDK_UNIQUE_ESTIMATE,    /* estimate of number of distinct values */
+       GDK_MAX_POS,            /* BUN position of largest value (oid) */
+       GDK_HASH_BUCKETS,       /* last used hash bucket size (oid) */
+       GDK_NUNIQUE,            /* number of unique values (oid) */
+       GDK_UNIQUE_ESTIMATE,    /* estimate of number of distinct values (dbl) */
 };
+
 gdk_export ValPtr BATgetprop(BAT *b, enum prop_t idx);
 
 /*
diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c
--- a/gdk/gdk_align.c
+++ b/gdk/gdk_align.c
@@ -255,6 +255,7 @@ BATmaterialize(BAT *b)
        b->tbaseoff = 0;
        b->theap->dirty = true;
        BATsetprop_nolock(b, GDK_NUNIQUE, TYPE_oid, &(oid){is_oid_nil(t) ? 1 : b->batCount});
+       BATsetprop_nolock(b, GDK_UNIQUE_ESTIMATE, TYPE_dbl, &(dbl){is_oid_nil(t) ? 1 : b->batCount});
        MT_lock_unset(&b->theaplock);
        b->ttype = TYPE_oid;
        BATsetdims(b);
diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c
--- a/gdk/gdk_bat.c
+++ b/gdk/gdk_bat.c
@@ -674,6 +674,7 @@ BATfree(BAT *b)
        MT_lock_set(&b->theaplock);
        if (nunique != BUN_NONE) {
                BATsetprop_nolock(b, GDK_NUNIQUE, TYPE_oid, &(oid){nunique});
+               BATsetprop_nolock(b, GDK_UNIQUE_ESTIMATE, TYPE_dbl, &(dbl){nunique});
                BATsetprop_nolock(b, GDK_HASH_BUCKETS, TYPE_oid, &(oid){nbucket});
        }
        if (b->theap) {
@@ -1041,7 +1042,8 @@ BUNappendmulti(BAT *b, const void *value
                        return rc;
        }
 
-       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
+       if (count > BATcount(b) / GDK_UNIQUE_ESTIMATE_KEEP_FRACTION)
+               BATrmprop(b, GDK_UNIQUE_ESTIMATE);
        b->theap->dirty = true;
        const void *t = b->ttype == TYPE_msk ? &(msk){false} : ATOMnilptr(b->ttype);
        if (b->ttype == TYPE_oid) {
@@ -1277,7 +1279,8 @@ BUNdelete(BAT *b, oid o)
                b->tnorevsorted = 0;
        MT_lock_set(&b->theaplock);
        b->batCount--;
-       BATrmprop_nolock(b, GDK_UNIQUE_ESTIMATE);
+       if (BATcount(b) < GDK_UNIQUE_ESTIMATE_KEEP_FRACTION)
+               BATrmprop_nolock(b, GDK_UNIQUE_ESTIMATE);
        MT_lock_unset(&b->theaplock);
        if (b->batCount <= 1) {
                /* some trivial properties */
@@ -1405,7 +1408,8 @@ BUNinplacemulti(BAT *b, const oid *posit
                                } else {
                                        BATrmprop_nolock(b, GDK_MIN_POS);
                                }
-                               BATrmprop_nolock(b, GDK_UNIQUE_ESTIMATE);
+                               if (count > BATcount(b) / GDK_UNIQUE_ESTIMATE_KEEP_FRACTION)
+                                       BATrmprop_nolock(b, GDK_UNIQUE_ESTIMATE);
                                MT_lock_unset(&b->theaplock);
                        } else {
                                PROPdestroy(b);
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -787,7 +787,8 @@ BATappend2(BAT *b, BAT *n, BAT *s, bool 
                        BATrmprop(b, GDK_MIN_POS);
                }
        }
-       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
+       if (cnt > BATcount(b) / GDK_UNIQUE_ESTIMATE_KEEP_FRACTION)
+               BATrmprop(b, GDK_UNIQUE_ESTIMATE);
        /* load hash so that we can maintain it */
        (void) BATcheckhash(b);
 
@@ -1163,7 +1164,8 @@ BATappend_or_update(BAT *b, BAT *p, cons
 
        OIDXdestroy(b);
        IMPSdestroy(b);
-       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
+       if (ni.count > BATcount(b) / GDK_UNIQUE_ESTIMATE_KEEP_FRACTION)
+               BATrmprop(b, GDK_UNIQUE_ESTIMATE);
        /* load hash so that we can maintain it */
        (void) BATcheckhash(b);
 
diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c
--- a/gdk/gdk_hash.c
+++ b/gdk/gdk_hash.c
@@ -818,6 +818,9 @@ BAThash_impl(BAT *restrict b, struct can
                maxmask = HASHmask(ci->ncand);
                if (mask > maxmask)
                        mask = maxmask;
+       } else if (!hascand && (prop = BATgetprop_try(b, GDK_UNIQUE_ESTIMATE)) != NULL) {
+               assert(prop->vtype == TYPE_dbl);
+               mask = (BUN) (prop->val.dval * 8 / 7);
        } else {
                /* dynamic hash: we start with HASHmask(ci->ncand)/64, or,
                 * if ci->ncand large enough, HASHmask(ci->ncand)/256; if there
@@ -981,6 +984,9 @@ BAThash_impl(BAT *restrict b, struct can
        }
        bat_iterator_end(&bi);
        if (!hascand) {
+               /* don't keep these properties while we have a hash
+                * structure: they get added again when the hash is
+                * freed */
                MT_lock_set(&b->theaplock);
                BATrmprop_nolock(b, GDK_HASH_BUCKETS);
                BATrmprop_nolock(b, GDK_NUNIQUE);
@@ -1120,9 +1126,14 @@ HASHappend_locked(BAT *b, BUN i, const v
                return;
        }
        assert(i * h->width == h->heaplink.free);
-       if (HASHfix(h, false, true) != GDK_SUCCEED) {
+       if (h->nunique < b->batCount / HASH_DESTROY_UNIQUES_FRACTION) {
+               b->thash = NULL;
                doHASHdestroy(b, h);
+               return;
+       }
+       if (HASHfix(h, false, true) != GDK_SUCCEED) {
                b->thash = NULL;
+               doHASHdestroy(b, h);
                return;
        }
        if (HASHwidth(i + 1) > h->width &&
@@ -1184,9 +1195,14 @@ HASHinsert_locked(BAT *b, BUN p, const v
                return;
        }
        assert(p * h->width < h->heaplink.free);
-       if (HASHfix(h, false, true) != GDK_SUCCEED) {
+       if (h->nunique < b->batCount / HASH_DESTROY_UNIQUES_FRACTION) {
+               b->thash = NULL;
                doHASHdestroy(b, h);
+               return;
+       }
+       if (HASHfix(h, false, true) != GDK_SUCCEED) {
                b->thash = NULL;
+               doHASHdestroy(b, h);
                return;
        }
        BUN c = HASHprobe(h, v);
@@ -1260,9 +1276,14 @@ HASHdelete_locked(BAT *b, BUN p, const v
                return;
        }
        assert(p * h->width < h->heaplink.free);
-       if (HASHfix(h, false, true) != GDK_SUCCEED) {
+       if (h->nunique < b->batCount / HASH_DESTROY_UNIQUES_FRACTION) {
+               b->thash = NULL;
                doHASHdestroy(b, h);
+               return;
+       }
+       if (HASHfix(h, false, true) != GDK_SUCCEED) {
                b->thash = NULL;
+               doHASHdestroy(b, h);
                return;
        }
        BUN c = HASHprobe(h, v);
diff --git a/gdk/gdk_logger.c b/gdk/gdk_logger.c
--- a/gdk/gdk_logger.c
+++ b/gdk/gdk_logger.c
@@ -342,11 +342,13 @@ string_reader(logger *lg, BAT *b, lng nr
                sz = (size_t)SZ;
                char *buf = lg->buf;
                if (lg->bufsize < sz) {
-                       lg->buf = buf = GDKrealloc(buf, sz);
+                       if (!(buf = GDKrealloc(lg->buf, sz)))
+                               return LOG_ERR;
+                       lg->buf = buf;
                        lg->bufsize = sz;
                }
 
-               if (!buf || mnstr_read(lg->input_log, buf, sz, 1) != 1)
+               if (mnstr_read(lg->input_log, buf, sz, 1) != 1)
                        return LOG_EOF;
                /* handle strings */
                char *t = buf;
@@ -2363,7 +2365,7 @@ string_writer(logger *lg, BAT *b, lng of
        size_t bufsz = lg->bufsize, resize = 0;
        BUN end = (BUN)(offset + nr);
        char *buf = lg->buf;
-       gdk_return res = GDK_FAIL;
+       gdk_return res = GDK_SUCCEED;
 
        if (!buf)
                return GDK_FAIL;
@@ -2372,11 +2374,11 @@ string_writer(logger *lg, BAT *b, lng of
        for ( ; p < end; ) {
                size_t sz = 0;
                if (resize) {
-                       lg->buf = buf = GDKrealloc(buf, resize);
-                       if (!buf) {
+                       if (!(buf = GDKrealloc(lg->buf, resize))) {
                                res = GDK_FAIL;
                                break;
                        }
+                       lg->buf = buf;
                        lg->bufsize = bufsz = resize;
                        resize = 0;
                }
@@ -2394,8 +2396,10 @@ string_writer(logger *lg, BAT *b, lng of
                                sz += len;
                        }
                }
-               if (sz && buf && mnstr_writeLng(lg->output_log, (lng) sz) && mnstr_write(lg->output_log, buf, sz, 1) == 1)
-                       res = GDK_SUCCEED;
+               if (sz && (!mnstr_writeLng(lg->output_log, (lng) sz) || mnstr_write(lg->output_log, buf, sz, 1) != 1)) {
+                       res = GDK_FAIL;
+                       break;
+               }
        }
        bat_iterator_end(&bi);
        return res;
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -463,6 +463,16 @@ extern MT_Lock GDKtmLock;
 #define GDKcacheLock(y)        GDKbbpLock[y].cache
 #define BBP_free(y)    GDKbbpLock[y].free
 
+/* when the number of updates to a BAT is less than 1 in this number, we
+ * keep the GDK_UNIQUE_ESTIMATE property */
+extern BUN GDK_UNIQUE_ESTIMATE_KEEP_FRACTION; /* should become a define once */
+/* if the number of unique values is less than 1 in this number, we
+ * destroy the hash rather than update it in HASH{append,insert,delete} */
+extern BUN HASH_DESTROY_UNIQUES_FRACTION;     /* likewise */
+/* if the estimated number of unique values is less than 1 in this
+ * number, don't build a hash table to do a hashselect */
+extern dbl NO_HASH_SELECT_FRACTION;           /* same here */
+
 #if !defined(NDEBUG) && !defined(__COVERITY__)
 /* see comment in gdk.h */
 #ifdef __GNUC__
diff --git a/gdk/gdk_select.c b/gdk/gdk_select.c
--- a/gdk/gdk_select.c
+++ b/gdk/gdk_select.c
@@ -1576,6 +1576,14 @@ BATselect(BAT *b, BAT *s, const void *tl
                        (!b->batTransient &&
                         ATOMsize(b->ttype) >= sizeof(BUN) / 4 &&
                         BATcount(b) * (ATOMsize(b->ttype) + 2 * sizeof(BUN)) < GDK_mem_maxsize / 2);
+               if (wanthash && !havehash) {
+                       const ValRecord *prop;
+                       if ((prop = BATgetprop(b, GDK_UNIQUE_ESTIMATE)) != NULL &&
+                           prop->val.dval < BATcount(b) / NO_HASH_SELECT_FRACTION) {
+                               /* too many duplicates: not worth it */
+                               wanthash = false;
+                       }
+               }
        }
 
        if (equi && !havehash && parent != 0) {
diff --git a/gdk/gdk_unique.c b/gdk/gdk_unique.c
--- a/gdk/gdk_unique.c
+++ b/gdk/gdk_unique.c
@@ -92,6 +92,8 @@ BATunique(BAT *b, BAT *s)
                        MT_lock_set(&b->theaplock);
                        if ((prop = BATgetprop_nolock(b, GDK_NUNIQUE)) != NULL)
                                initsize = prop->val.oval;
+                       else if ((prop = BATgetprop_nolock(b, GDK_UNIQUE_ESTIMATE)) != NULL)
+                               initsize = (BUN) prop->val.dval;
                        MT_lock_unset(&b->theaplock);
                }
        }
diff --git a/gdk/gdk_utils.c b/gdk/gdk_utils.c
--- a/gdk/gdk_utils.c
+++ b/gdk/gdk_utils.c
@@ -57,6 +57,16 @@ static void GDKunlockHome(int farmid);
 #undef realloc
 #undef free
 
+/* when the number of updates to a BAT is less than 1 in this number, we
+ * keep the GDK_UNIQUE_ESTIMATE property */
+BUN GDK_UNIQUE_ESTIMATE_KEEP_FRACTION = 1000; /* should become a define once */
+/* if the number of unique values is less than 1 in this number, we
+ * destroy the hash rather than update it in HASH{append,insert,delete} */
+BUN HASH_DESTROY_UNIQUES_FRACTION = 1000;     /* likewise */
+/* if the estimated number of unique values is less than 1 in this
+ * number, don't build a hash table to do a hashselect */
+dbl NO_HASH_SELECT_FRACTION = 1000;           /* same here */
+
 /*
  * @+ Monet configuration file
  * Parse a possible MonetDB config file (if specified by command line
@@ -1146,6 +1156,21 @@ GDKinit(opt *set, int setlen, bool embed
                TRC_CRITICAL(GDK, "GDKsetenv revision failed");
                return GDK_FAIL;
        }
+       GDK_UNIQUE_ESTIMATE_KEEP_FRACTION = 0;
+       if ((p = GDKgetenv("gdk_unique_estimate_keep_fraction")) != NULL)
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to