Changeset: 99be915dc963 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=99be915dc963 Modified Files: gdk/gdk.h gdk/gdk_bat.c gdk/gdk_batop.c gdk/gdk_join.c Branch: Jun2020 Log Message:
Cache estimated count of unique values; don't project sample in certain cases. diffs (97 lines): diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2022,6 +2022,7 @@ enum prop_t { GDK_MAX_VALUE, /* largest non-nil value in BAT */ GDK_HASH_BUCKETS, /* last used hash bucket size */ GDK_NUNIQUE, /* number of unique values */ + GDK_UNIQUE_ESTIMATE, /* estimate of number of distinct values */ }; /* diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -1073,6 +1073,7 @@ BUNappend(BAT *b, const void *t, bool fo IMPSdestroy(b); /* no support for inserts in imprints yet */ OIDXdestroy(b); BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); #if 0 /* enable if we have more properties than just min/max */ PROPrec *prop; do { @@ -1159,6 +1160,7 @@ BUNdelete(BAT *b, oid o) OIDXdestroy(b); HASHdestroy(b); BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); #if 0 /* enable if we have more properties than just min/max */ do { for (prop = b->tprops; prop; prop = prop->next) @@ -1248,6 +1250,7 @@ BUNinplace(BAT *b, BUN p, const void *t, } } BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); #if 0 /* enable if we have more properties than just min/max */ do { for (prop = b->tprops; prop; prop = prop->next) diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -573,6 +573,7 @@ BATappend2(BAT *b, BAT *n, BAT *s, bool } } BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); #if 0 /* enable if we have more properties than just min/max */ do { for (prop = b->tprops; prop; prop = prop->next) @@ -890,6 +891,7 @@ BATreplace(BAT *b, BAT *p, BAT *n, bool OIDXdestroy(b); IMPSdestroy(b); BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); b->tsorted = b->trevsorted = false; b->tnosorted = b->tnorevsorted = 0; diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -3023,12 +3023,19 @@ guess_uniques(BAT *b, struct canditer *c if (b->tkey) return (double) ci->ncand; - if (ci->s) { + if (ci->s == NULL || + (ci->tpe == cand_dense && ci->ncand == BATcount(b))) { + PROPrec *p = BATgetprop(b, GDK_UNIQUE_ESTIMATE); + if (p) { + TRC_DEBUG(ALGO, "b=" ALGOBATFMT " use cached value\n", + ALGOBATPAR(b)); + return p->v.val.dval; + } + s1 = BATsample(b, 1000); + } else { BAT *s2 = BATsample(ci->s, 1000); s1 = BATproject(s2, ci->s); BBPreclaim(s2); - } else { - s1 = BATsample(b, 1000); } BUN n2 = BATcount(s1); BUN n1 = n2 / 2; @@ -3038,7 +3045,12 @@ guess_uniques(BAT *b, struct canditer *c double A = (double) (cnt2 - cnt1) / (n2 - n1); double B = cnt1 - n1 * A; - return A * ci->ncand + B; + B += A * ci->ncand; + if (ci->s == NULL || + (ci->tpe == cand_dense && ci->ncand == BATcount(b))) { + BATsetprop(b, GDK_UNIQUE_ESTIMATE, TYPE_dbl, &B); + } + return B; } #define MASK_EQ 1 _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list