Changeset: 99be915dc963 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=99be915dc963
Modified Files:
        gdk/gdk.h
        gdk/gdk_bat.c
        gdk/gdk_batop.c
        gdk/gdk_join.c
Branch: Jun2020
Log Message:

Cache estimated count of unique values; don't project sample in certain cases.


diffs (97 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2022,6 +2022,7 @@ enum prop_t {
        GDK_MAX_VALUE,          /* largest non-nil value in BAT */
        GDK_HASH_BUCKETS,       /* last used hash bucket size */
        GDK_NUNIQUE,            /* number of unique values */
+       GDK_UNIQUE_ESTIMATE,    /* estimate of number of distinct values */
 };
 
 /*
diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c
--- a/gdk/gdk_bat.c
+++ b/gdk/gdk_bat.c
@@ -1073,6 +1073,7 @@ BUNappend(BAT *b, const void *t, bool fo
        IMPSdestroy(b); /* no support for inserts in imprints yet */
        OIDXdestroy(b);
        BATrmprop(b, GDK_NUNIQUE);
+       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 #if 0          /* enable if we have more properties than just min/max */
        PROPrec *prop;
        do {
@@ -1159,6 +1160,7 @@ BUNdelete(BAT *b, oid o)
        OIDXdestroy(b);
        HASHdestroy(b);
        BATrmprop(b, GDK_NUNIQUE);
+       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 #if 0          /* enable if we have more properties than just min/max */
        do {
                for (prop = b->tprops; prop; prop = prop->next)
@@ -1248,6 +1250,7 @@ BUNinplace(BAT *b, BUN p, const void *t,
                        }
                }
                BATrmprop(b, GDK_NUNIQUE);
+               BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 #if 0          /* enable if we have more properties than just min/max */
                do {
                        for (prop = b->tprops; prop; prop = prop->next)
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -573,6 +573,7 @@ BATappend2(BAT *b, BAT *n, BAT *s, bool 
                }
        }
        BATrmprop(b, GDK_NUNIQUE);
+       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 #if 0          /* enable if we have more properties than just min/max */
        do {
                for (prop = b->tprops; prop; prop = prop->next)
@@ -890,6 +891,7 @@ BATreplace(BAT *b, BAT *p, BAT *n, bool 
        OIDXdestroy(b);
        IMPSdestroy(b);
        BATrmprop(b, GDK_NUNIQUE);
+       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 
        b->tsorted = b->trevsorted = false;
        b->tnosorted = b->tnorevsorted = 0;
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -3023,12 +3023,19 @@ guess_uniques(BAT *b, struct canditer *c
        if (b->tkey)
                return (double) ci->ncand;
 
-       if (ci->s) {
+       if (ci->s == NULL ||
+           (ci->tpe == cand_dense && ci->ncand == BATcount(b))) {
+               PROPrec *p = BATgetprop(b, GDK_UNIQUE_ESTIMATE);
+               if (p) {
+                       TRC_DEBUG(ALGO, "b=" ALGOBATFMT " use cached value\n",
+                                 ALGOBATPAR(b));
+                       return p->v.val.dval;
+               }
+               s1 = BATsample(b, 1000);
+       } else {
                BAT *s2 = BATsample(ci->s, 1000);
                s1 = BATproject(s2, ci->s);
                BBPreclaim(s2);
-       } else {
-               s1 = BATsample(b, 1000);
        }
        BUN n2 = BATcount(s1);
        BUN n1 = n2 / 2;
@@ -3038,7 +3045,12 @@ guess_uniques(BAT *b, struct canditer *c
        double A = (double) (cnt2 - cnt1) / (n2 - n1);
        double B = cnt1 - n1 * A;
 
-       return A * ci->ncand + B;
+       B += A * ci->ncand;
+       if (ci->s == NULL ||
+           (ci->tpe == cand_dense && ci->ncand == BATcount(b))) {
+               BATsetprop(b, GDK_UNIQUE_ESTIMATE, TYPE_dbl, &B);
+       }
+       return B;
 }
 
 #define MASK_EQ                1
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to