Changeset: 56f633d64ef4 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=56f633d64ef4
Removed Files:
        sql/test/miscellaneous/Tests/simple_plans.stable.out.single
Modified Files:
        gdk/gdk_join.c
        sql/backends/monet5/sql_result.c
        sql/backends/monet5/sql_round_impl.h
        sql/backends/monet5/sql_upgrades.c
        sql/server/rel_optimizer.c
        sql/server/sql_parser.y
        sql/test/emptydb-upgrade-chain-hge/Tests/upgrade.stable.out.int128
        sql/test/emptydb-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128
        sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out
        sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out.32bit
        sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out.int128
        sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out.ppc64
        sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out.ppc64.int128
        sql/test/emptydb-upgrade-hge/Tests/upgrade.stable.out.int128
        sql/test/emptydb-upgrade/Tests/upgrade.stable.out
        sql/test/emptydb-upgrade/Tests/upgrade.stable.out.32bit
        sql/test/emptydb-upgrade/Tests/upgrade.stable.out.int128
        sql/test/emptydb/Tests/check.stable.out
        sql/test/emptydb/Tests/check.stable.out.32bit
        sql/test/emptydb/Tests/check.stable.out.int128
        sql/test/miscellaneous/Tests/simple_plans.stable.out
        sql/test/testdb-upgrade-chain-hge/Tests/upgrade.stable.out.int128
        sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out
        sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out.32bit
        sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out.int128
        sql/test/testdb-upgrade-hge/Tests/upgrade.stable.out.int128
        sql/test/testdb-upgrade/Tests/upgrade.stable.out
        sql/test/testdb-upgrade/Tests/upgrade.stable.out.32bit
        sql/test/testdb-upgrade/Tests/upgrade.stable.out.int128
Branch: default
Log Message:

Merge with Oct2020 branch.


diffs (truncated from 2712 to 300 lines):

diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c
--- a/gdk/gdk_align.c
+++ b/gdk/gdk_align.c
@@ -370,8 +370,10 @@ VIEWreset(BAT *b)
                b->batCapacity = cnt;
 
                /* insert all of v in b, and quit */
-               if (BATappend2(b, v, NULL, false, false) != GDK_SUCCEED)
+               if (BATappend2(b, v, NULL, false, false) != GDK_SUCCEED) {
+                       GDKerror("appending view failed");
                        goto bailout;
+               }
                BBPreclaim(v);
        }
        return GDK_SUCCEED;
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -2831,6 +2831,233 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
        return GDK_FAIL;
 }
 
+/* population count: count number of 1 bits in a value */
+static inline uint32_t __attribute__((__const__))
+pop(uint32_t x)
+{
+#if defined(__GNUC__)
+       return (uint32_t) __builtin_popcount(x);
+#elif defined(_MSC_VER)
+       return (uint32_t) __popcnt((unsigned int) (x));
+#else
+       /* divide and conquer implementation (the two versions are
+        * essentially equivalent, but the first version is written a
+        * bit smarter) */
+#if 1
+       x -= (x >> 1) & ~0U/3 /* 0x55555555 */; /* 3-1=2; 2-1=1; 1-0=1; 0-0=0 */
+       x = (x & ~0U/5) + ((x >> 2) & ~0U/5) /* 0x33333333 */;
+       x = (x + (x >> 4)) & ~0UL/0x11 /* 0x0F0F0F0F */;
+       x = (x + (x >> 8)) & ~0UL/0x101 /* 0x00FF00FF */;
+       x = (x + (x >> 16)) & 0xFFFF /* ~0UL/0x10001 */;
+#else
+       x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
+       x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
+       x = (x & 0x0F0F0F0F) + ((x >>  4) & 0x0F0F0F0F);
+       x = (x & 0x00FF00FF) + ((x >>  8) & 0x00FF00FF);
+       x = (x & 0x0000FFFF) + ((x >> 16) & 0x0000FFFF);
+#endif
+       return x;
+#endif
+}
+
+/* Count the number of unique values for the first half and the complete
+ * set (the sample s of b) and return the two values in *cnt1 and
+ * *cnt2. In case of error, both values are 0. */
+static void
+count_unique(BAT *b, BAT *s, BUN *cnt1, BUN *cnt2)
+{
+       struct canditer ci;
+       BUN half;
+       BUN cnt = 0;
+       const void *v;
+       const char *bvals;
+       const char *bvars;
+       oid bval;
+       int bwidth;
+       oid i, o;
+       const char *nme;
+       BUN hb;
+       BATiter bi;
+       int (*cmp)(const void *, const void *);
+       const char *algomsg = "";
+       lng t0 = 0;
+
+       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+       canditer_init(&ci, b, s);
+       half = ci.ncand / 2;
+
+       if (b->tkey || ci.ncand <= 1 || BATtdense(b)) {
+               /* trivial: already unique */
+               *cnt1 = half;
+               *cnt2 = ci.ncand;
+               return;
+       }
+
+       if ((BATordered(b) && BATordered_rev(b)) ||
+           (b->ttype == TYPE_void && is_oid_nil(b->tseqbase))) {
+               /* trivial: all values are the same */
+               *cnt1 = *cnt2 = 1;
+               return;
+       }
+
+       assert(b->ttype != TYPE_void);
+
+       bvals = Tloc(b, 0);
+       if (b->tvarsized && b->ttype)
+               bvars = b->tvheap->base;
+       else
+               bvars = NULL;
+       bwidth = Tsize(b);
+       cmp = ATOMcompare(b->ttype);
+       bi = bat_iterator(b);
+
+       *cnt1 = *cnt2 = 0;
+
+       if (BATordered(b) || BATordered_rev(b)) {
+               const void *prev = NULL;
+               algomsg = "sorted";
+               for (i = 0; i < ci.ncand; i++) {
+                       if (i == half)
+                               *cnt1 = cnt;
+                       o = canditer_next(&ci);
+                       v = VALUE(b, o - b->hseqbase);
+                       if (prev == NULL || (*cmp)(v, prev) != 0) {
+                               cnt++;
+                       }
+                       prev = v;
+               }
+               *cnt2 = cnt;
+       } else if (ATOMbasetype(b->ttype) == TYPE_bte) {
+               unsigned char val;
+               uint32_t seen[256 / 32];
+
+               algomsg = "byte-sized atoms";
+               assert(bvars == NULL);
+               for (i = 0; i < ci.ncand; i++) {
+                       if (i == ci.ncand/ 2) {
+                               cnt = 0;
+                               for (int j = 0; j < 256 / 32; j++)
+                                       cnt += pop(seen[j]);
+                               *cnt1 = cnt;
+                       }
+                       o = canditer_next(&ci);
+                       val = ((const unsigned char *) bvals)[o - b->hseqbase];
+                       if (!(seen[val >> 5] & (1U << (val & 0x1F)))) {
+                               seen[val >> 5] |= 1U << (val & 0x1F);
+                       }
+               }
+               cnt = 0;
+               for (int j = 0; j < 256 / 32; j++)
+                       cnt += pop(seen[j]);
+               *cnt2 = cnt;
+       } else if (ATOMbasetype(b->ttype) == TYPE_sht) {
+               unsigned short val;
+               uint32_t *seen = NULL;
+
+               algomsg = "short-sized atoms";
+               assert(bvars == NULL);
+               seen = GDKzalloc((65536 / 32) * sizeof(seen[0]));
+               if (seen == NULL)
+                       return;
+               for (i = 0; i < ci.ncand; i++) {
+                       if (i == half) {
+                               cnt = 0;
+                               for (int j = 0; j < 65536 / 32; j++)
+                                       cnt += pop(seen[j]);
+                               *cnt1 = cnt;
+                       }
+                       o = canditer_next(&ci);
+                       val = ((const unsigned short *) bvals)[o - b->hseqbase];
+                       if (!(seen[val >> 5] & (1U << (val & 0x1F)))) {
+                               seen[val >> 5] |= 1U << (val & 0x1F);
+                       }
+               }
+               cnt = 0;
+               for (int j = 0; j < 65536 / 32; j++)
+                       cnt += pop(seen[j]);
+               *cnt2 = cnt;
+               GDKfree(seen);
+               seen = NULL;
+       } else {
+               BUN prb;
+               BUN p;
+               BUN mask;
+               Hash hs = {0};
+
+               GDKclrerr();    /* not interested in BAThash errors */
+               algomsg = "new partial hash";
+               nme = BBP_physical(b->batCacheid);
+               mask = HASHmask(ci.ncand);
+               if (mask < ((BUN) 1 << 16))
+                       mask = (BUN) 1 << 16;
+               if (snprintf(hs.heaplink.filename, 
sizeof(hs.heaplink.filename), "%s.thshunil%x", nme, (unsigned) THRgettid()) >= 
(int) sizeof(hs.heaplink.filename) ||
+                   snprintf(hs.heapbckt.filename, 
sizeof(hs.heapbckt.filename), "%s.thshunib%x", nme, (unsigned) THRgettid()) >= 
(int) sizeof(hs.heapbckt.filename) ||
+                   HASHnew(&hs, b->ttype, BUNlast(b), mask, BUN_NONE, false) 
!= GDK_SUCCEED) {
+                       GDKerror("cannot allocate hash table\n");
+                       HEAPfree(&hs.heaplink, true);
+                       HEAPfree(&hs.heapbckt, true);
+                       return;
+               }
+               for (i = 0; i < ci.ncand; i++) {
+                       if (i == half)
+                               *cnt1 = cnt;
+                       o = canditer_next(&ci);
+                       v = VALUE(b, o - b->hseqbase);
+                       prb = HASHprobe(&hs, v);
+                       for (hb = HASHget(&hs, prb);
+                            hb != HASHnil(&hs);
+                            hb = HASHgetlink(&hs, hb)) {
+                               if (cmp(v, BUNtail(bi, hb)) == 0)
+                                       break;
+                       }
+                       if (hb == HASHnil(&hs)) {
+                               p = o - b->hseqbase;
+                               cnt++;
+                               /* enter into hash table */
+                               HASHputlink(&hs, p, HASHget(&hs, prb));
+                               HASHput(&hs, prb, p);
+                       }
+               }
+               *cnt2 = cnt;
+               HEAPfree(&hs.heaplink, true);
+               HEAPfree(&hs.heapbckt, true);
+       }
+
+       TRC_DEBUG(ALGO, "b=" ALGOBATFMT ",s=" ALGOOPTBATFMT
+                 " -> " BUNFMT " " BUNFMT " (%s -- " LLFMT "usec)\n",
+                 ALGOBATPAR(b), ALGOOPTBATPAR(s),
+                 *cnt1, *cnt2, algomsg, GDKusec() - t0);
+
+       return;
+}
+
+static double
+guess_uniques(BAT *b, struct canditer *ci)
+{
+       BUN cnt1, cnt2;
+       BAT *s1;
+
+       if (b->tkey)
+               return (double) ci->ncand;
+
+       if (ci->s) {
+               BAT *s2 = BATsample(ci->s, 1000);
+               s1 = BATproject(s2, ci->s);
+               BBPreclaim(s2);
+       } else {
+               s1 = BATsample(b, 1000);
+       }
+       BUN n2 = BATcount(s1);
+       BUN n1 = n2 / 2;
+       count_unique(b, s1, &cnt1, &cnt2);
+       BBPreclaim(s1);
+
+       double A = (double) (cnt2 - cnt1) / (n2 - n1);
+       double B = cnt1 - n1 * A;
+
+       return A * ci->ncand + B;
+}
+
 #define MASK_EQ                1
 #define MASK_LT                2
 #define MASK_GT                4
@@ -3279,7 +3506,15 @@ leftjoin(BAT **r1p, BAT **r2p, BAT *l, B
                /* no hash table, so cost includes time to build the
                 * hash table (single scan) plus the time to do the
                 * lookups (also single scan, we assume some chains) */
-               rcost = lci.ncand * 1.1;
+               PROPrec *prop = BATgetprop(r, GDK_NUNIQUE);
+               if (prop) {
+                       /* we know number of unique values, assume some
+                        * chains */
+                       rcost = lci.ncand * 1.1 * ((double) BATcount(r) / 
prop->v.val.oval);
+               } else {
+                       /* guess number of unique value and work with that */
+                       rcost = lci.ncand * 1.1 * ((double) BATcount(r) / 
guess_uniques(r, &rci));
+               }
 #ifdef PERSISTENTHASH
                /* only count the cost of creating the hash for
                 * non-persistent bats */
@@ -3328,7 +3563,16 @@ leftjoin(BAT **r1p, BAT **r2p, BAT *l, B
                        /* no hash table, so cost includes time to build the
                         * hash table (single scan) plus the time to do the
                         * lookups (also single scan, we assume some chains) */
-                       lcost = rci.ncand * 1.1;
+                       PROPrec *prop = BATgetprop(l, GDK_NUNIQUE);
+                       if (prop) {
+                               /* we know number of unique values, assume some
+                                * chains */
+                               lcost = rci.ncand * 1.1 * ((double) BATcount(l) 
/ prop->v.val.oval);
+                       } else {
+                               /* guess number of unique value and work
+                                * with that */
+                               lcost = rci.ncand * 1.1 * ((double) BATcount(l) 
/ guess_uniques(l, &lci));
+                       }
 #ifdef PERSISTENTHASH
                        /* only count the cost of creating the hash
                         * for non-persistent bats */
@@ -3650,7 +3894,15 @@ BATjoin(BAT **r1p, BAT **r2p, BAT *l, BA
                /* no hash table, so cost includes time to build the
                 * hash table (single scan) plus the time to do the
                 * lookups (also single scan, we assume some chains) */
-               lcost = rci.ncand * 1.1;
+               PROPrec *prop = BATgetprop(l, GDK_NUNIQUE);
+               if (prop) {
+                       /* we know number of unique values, assume some
+                        * chains */
+                       lcost = rci.ncand * 1.1 * ((double) BATcount(l) / 
prop->v.val.oval);
+               } else {
+                       /* guess number of unique value and work with that */
+                       lcost = rci.ncand * 1.1 * ((double) BATcount(l) / 
guess_uniques(l, &lci));
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to