Changeset: d6720878cd47 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d6720878cd47
Modified Files:
        gdk/gdk_join.c
Branch: pushdown
Log Message:

Implemented better (hopefully) choices for which join algorithm to use.


diffs (truncated from 654 to 300 lines):

diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -2376,21 +2376,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                nr++;                                                   \
        } while (false)
 
-#define HASHloop_bound_TYPE(vals, h, hb, v, lo, hi, TYPE)      \
-       for (hb = HASHget(h, hash_##TYPE(h, &v));               \
-            hb != HASHnil(h);                                  \
-            hb = HASHgetlink(h,hb))                            \
-               if (hb >= (lo) && hb < (hi) &&                  \
-                   v == vals[hb])
-
-#define HASHloop_bound(bi, h, hb, v, lo, hi)           \
-       for (hb = HASHget(h, HASHprobe((h), v));        \
-            hb != HASHnil(h);                          \
-            hb = HASHgetlink(h,hb))                    \
-               if (hb >= (lo) && hb < (hi) &&          \
-                   (cmp == NULL ||                     \
-                    (*cmp)(v, BUNtail(bi, hb)) == 0))
-
 #define HASHJOIN(TYPE)                                                 \
        do {                                                            \
                TYPE *rvals = Tloc(r, 0);                               \
@@ -2419,16 +2404,37 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                                        if (semi)                       \
                                                break;                  \
                                }                                       \
+                       } else if (rci->tpe != cand_dense) {            \
+                               for (rb = HASHget(hsh, hash_##TYPE(hsh, &v)); \
+                                    rb != HASHnil(hsh);                \
+                                    rb = HASHgetlink(hsh, rb)) {       \
+                                       if (rb >= rl && rb < rh &&      \
+                                           v == rvals[rb] &&           \
+                                           canditer_contains(rci, ro = (oid) 
(rb - roff + rseq))) { \
+                                               if (only_misses) {      \
+                                                       nr++;           \
+                                                       break;          \
+                                               }                       \
+                                               HASHLOOPBODY();         \
+                                               if (semi)               \
+                                                       break;          \
+                                       }                               \
+                               }                                       \
                        } else {                                        \
-                               HASHloop_bound_TYPE(rvals, hsh, rb, v, rl, rh, 
TYPE) { \
-                                       ro = (oid) (rb - rl + rseq);    \
-                                       if (only_misses) {              \
-                                               nr++;                   \
-                                               break;                  \
+                               for (rb = HASHget(hsh, hash_##TYPE(hsh, &v)); \
+                                    rb != HASHnil(hsh);                \
+                                    rb = HASHgetlink(hsh, rb)) {       \
+                                       if (rb >= rl && rb < rh &&      \
+                                           v == rvals[rb]) {           \
+                                               if (only_misses) {      \
+                                                       nr++;           \
+                                                       break;          \
+                                               }                       \
+                                               ro = (oid) (rb - roff + rseq); \
+                                               HASHLOOPBODY();         \
+                                               if (semi)               \
+                                                       break;          \
                                        }                               \
-                                       HASHLOOPBODY();                 \
-                                       if (semi)                       \
-                                               break;                  \
                                }                                       \
                        }                                               \
                        if (nr == 0) {                                  \
@@ -2478,11 +2484,13 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
         struct canditer *restrict lci, struct canditer *restrict rci,
         bool nil_matches, bool nil_on_miss, bool semi, bool only_misses,
         bool not_in,
-        BUN estimate, lng t0, bool swapped, bool phash, const char *reason)
+        BUN estimate, lng t0, bool swapped, bool hash, bool phash,
+        const char *reason)
 {
        oid lo, ro;
        BATiter ri;
-       BUN rb;
+       BUN rb, roff = 0;
+       /* rl, rh: lower and higher bounds for BUN values in hash table */
        BUN rl, rh;
        oid rseq;
        BUN nr;
@@ -2532,57 +2540,44 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
        rl = rci->seq - r->hseqbase;
        rh = canditer_last(rci) + 1 - r->hseqbase;
        if (phash) {
+               /* there is a hash on the parent which we should use */
                BAT *b = BBPdescriptor(VIEWtparent(r));
-               assert(rci->tpe == cand_dense && rci->ncand == BATcount(r));
                TRC_DEBUG(ALGO, "%s(%s): using "
                          "parent(" ALGOBATFMT ") for hash\n",
                          __func__,
                          BATgetId(r), ALGOBATPAR(b));
-               rl += (BUN) ((r->theap.base - b->theap.base) >> r->tshift);
-               rh += rl;
+               hsh = b->thash;
+               roff = (BUN) ((r->theap.base - b->theap.base) >> r->tshift);
+               rl += roff;
+               rh += roff;
                r = b;
-       }
-
-       hsh = NULL;
-       if (rci->tpe == cand_dense) {
-               if (BATcheckhash(r) &&
-                   BATcount(r) / r->thash->nheads * lci->ncand < lci->ncand + 
rci->ncand) {
-                       /* there is a hash already and the collision
-                        * chains aren't too long, use it */
-                       TRC_DEBUG(ALGO, "%s(%s): using "
-                                 "existing hash with candidate list\n",
-                                 __func__, BATgetId(r));
-                       hsh = r->thash;
-               } else if (rci->ncand == BATcount(r) ||
-                          (!r->batTransient &&
-                           rci->ncand > BATcount(r) * 4 / 5)) {
-                       /* if there is (effectively) no candidate
-                        * list, or if b is persistent and we're
-                        * joining with the majority of b, create a
-                        * hash on b */
-                       if (BAThash(r) != GDK_SUCCEED) {
-                               hsh = NULL;
-                               goto bailout;
-                       }
-                       hsh = r->thash;
-               }
-       }
-       if (hsh == NULL) {
-               /* create a hash for the part of b that is candidate */
+       } else if (hash) {
+               /* there is a hash on r which we should use */
+               hsh = r->thash;
+       } else if (rci->tpe != cand_dense || rci->ncand != BATcount(r)) {
+               /* we need to create a hash on r specific for the
+                * candidate list */
                char ext[32];
-               assert(!phash);
-               assert(rci->s != NULL);
+               assert(rci->s);
                TRC_DEBUG(ALGO, "%s(%s): creating "
                          "hash for candidate list\n",
                          __func__,
                          BATgetId(r));
-               if (snprintf(ext, sizeof(ext), "thshjn%x", rci->s->batCacheid) 
>= (int) sizeof(ext))
+               if (snprintf(ext, sizeof(ext), "thshjn%x",
+                            rci->s->batCacheid) >= (int) sizeof(ext))
                        goto bailout;
                if ((hsh = BAThash_impl(r, rci, ext)) == NULL) {
                        goto bailout;
                }
                hash_cand = true;
+       } else {
+               /* we need to create a hash on r */
+               if (BAThash(r) != GDK_SUCCEED)
+                       goto bailout;
+               hsh = r->thash;
        }
+       assert(hsh != NULL);
+
        ri = bat_iterator(r);
 
        if (not_in && !r->tnonil) {
@@ -2602,9 +2597,16 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
                                }
                        }
                } else {
-                       HASHloop_bound(ri, hsh, rb, nil, rl, rh) {
-                               return nomatch(r1p, r2p, l, r, lci,
-                                              false, false, __func__, t0);
+                       for (rb = HASHget(hsh, HASHprobe(hsh, nil));
+                            rb != HASHnil(hsh);
+                            rb = HASHgetlink(hsh, rb)) {
+                               if (rb >= rl && rb < rh &&
+                                   (cmp == NULL ||
+                                    (*cmp)(v, BUNtail(ri, rb)) == 0)) {
+                                       return nomatch(r1p, r2p, l, r, lci,
+                                                      false, false,
+                                                      __func__, t0);
+                               }
                        }
                }
        }
@@ -2660,16 +2662,37 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
                                        if (semi)
                                                break;
                                }
+                       } else if (rci->tpe != cand_dense) {
+                               for (rb = HASHget(hsh, HASHprobe(hsh, v));
+                                    rb != HASHnil(hsh);
+                                    rb = HASHgetlink(hsh, rb)) {
+                                       if (rb >= rl && rb < rh &&
+                                           (*(cmp))(v, BUNtail(ri, rb)) == 0 &&
+                                           canditer_contains(rci, ro = (oid) 
(rb - roff + rseq))) {
+                                               if (only_misses) {
+                                                       nr++;
+                                                       break;
+                                               }
+                                               HASHLOOPBODY();
+                                               if (semi)
+                                                       break;
+                                       }
+                               }
                        } else {
-                               HASHloop_bound(ri, hsh, rb, v, rl, rh) {
-                                       ro = (oid) (rb - rl + rseq);
-                                       if (only_misses) {
-                                               nr++;
-                                               break;
+                               for (rb = HASHget(hsh, HASHprobe(hsh, v));
+                                    rb != HASHnil(hsh);
+                                    rb = HASHgetlink(hsh, rb)) {
+                                       if (rb >= rl && rb < rh &&
+                                           (*(cmp))(v, BUNtail(ri, rb)) == 0) {
+                                               if (only_misses) {
+                                                       nr++;
+                                                       break;
+                                               }
+                                               ro = (oid) (rb - roff + rseq);
+                                               HASHLOOPBODY();
+                                               if (semi)
+                                                       break;
                                        }
-                                       HASHLOOPBODY();
-                                       if (semi)
-                                               break;
                                }
                        }
                        if (nr == 0) {
@@ -3405,7 +3428,7 @@ leftjoin(BAT **r1p, BAT **r2p, BAT *l, B
 {
        BUN lcnt, rcnt;
        struct canditer lci, rci;
-       bool phash = false;
+       bool rhash, prhash = false;
        bat parent;
 
        /* only_misses implies left output only */
@@ -3484,13 +3507,145 @@ leftjoin(BAT **r1p, BAT **r2p, BAT *l, B
                                 nil_matches, nil_on_miss, semi, only_misses,
                                 not_in, estimate, t0, false, func);
        }
-       phash = rci.tpe == cand_dense &&
-               rci.ncand == BATcount(r) &&
-               VIEWtparent(r) != 0 &&
-               BATcount(BBP_cache(VIEWtparent(r))) == BATcount(r);
+       rhash = BATcheckhash(r);
+       double rcost = 0;
+       if (rhash) {
+               /* average chain length */
+               rcost = (double) BATcount(r) / r->thash->nheads;
+       } else if ((parent = VIEWtparent(r)) != 0) {
+               BAT *b = BBPdescriptor(parent);
+               rhash = prhash = BATcheckhash(b);
+               if (prhash) {
+                       /* average chain length */
+                       rcost = (double) BATcount(b) / b->thash->nheads;
+               }
+       }
+       if (!rhash) {
+               /* no hash table, so cost includes time to build the
+                * hash table (single scan) plus the time to do the
+                * lookups (also single scan, we assume some chains) */
+               rcost = (double) rci.ncand + lci.ncand * 1.1;
+       } else {
+               if (rci.noids > 0) {
+                       /* if we need to do binary search on candidate
+                        * list, take that into account */
+                       rcost *= log2((double) rci.noids) + 1;
+               }
+               /* all of this so far for each lookup of which we have
+                * rci.ncand */
+               rcost *= lci.ncand;
+               if (rci.ncand < BATcount(r) &&
+                   rci.ncand + lci.ncand * 1.1 < rcost) {
+                       /* it's cheaper to rebuild the hash table for
+                        * just the candidates (this saves on the
+                        * binary searches), again, assume some
+                        * chains */
+                       rhash = prhash = false;
+                       rcost = rci.ncand + lci.ncand * 1.1;
+               }
+       }
+
+       if (!nil_on_miss && !only_misses && !not_in && r2p == NULL) {
+               /* maybe do a hash join on the swapped operands; if we
+                * do, we need to sort the output, so we take that into
+                * account as well */
+               bool lhash = BATcheckhash(l);
+               bool plhash = false;
+               double lcost = 0;
+               if (lhash) {
+                       /* average chain length */
+                       lcost = (double) BATcount(l) / l->thash->nheads;
+               } else if ((parent = VIEWtparent(l)) != 0) {
+                       BAT *b = BBPdescriptor(parent);
+                       lhash = plhash = BATcheckhash(b);
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to