Changeset: 1dad1e7691a0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1dad1e7691a0
Modified Files:
        gdk/gdk_join.c
        gdk/gdk_private.h
        gdk/gdk_select.c
Branch: Jul2015
Log Message:

Calculate maximum join result size and use that to cap growth of BATs.
This should fix bug 3791.


diffs (truncated from 359 to 300 lines):

diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -108,10 +108,41 @@ joinparamcheck(BAT *l, BAT *r1, BAT *r2,
 }
 
 /* Create the result bats for a join. */
-static gdk_return
-joininitresults(BAT **r1p, BAT **r2p, BUN size, const char *func)
+static BUN
+joininitresults(BAT **r1p, BAT **r2p, BUN lcnt, BUN rcnt, int lkey, int rkey, 
int semi, int nil_on_miss, BUN estimate, const char *func)
 {
        BAT *r1, *r2;
+       BUN maxsize, size;
+
+       lkey |= lcnt <= 1;
+       rkey |= rcnt <= 1;
+
+       if (rkey | semi) {
+               /* each entry left matches at most one on right, in
+                * case nil_on_miss is also set, each entry matches
+                * exactly one */
+               maxsize = lcnt;
+       } else if (lkey) {
+               /* each entry on right is matched at most once */
+               if (nil_on_miss) {
+                       /* one entry left could match all right, and
+                        * all other entries left match nil */
+                       maxsize = lcnt + rcnt - 1;
+               } else {
+                       maxsize = rcnt;
+               }
+       } else {
+               /* in the worst case we have a full cross product */
+               if (lcnt == 0 || rcnt == 0)
+                       maxsize = nil_on_miss ? lcnt : 0;
+               else if (BUN_MAX / lcnt >= rcnt)
+                       maxsize = BUN_MAX;
+               else
+                       maxsize = lcnt * rcnt;
+       }
+       size = estimate == BUN_NONE ? lcnt : estimate;
+       if (size > maxsize)
+               size = maxsize;
 
        r1 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT);
        r2 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT);
@@ -120,7 +151,7 @@ joininitresults(BAT **r1p, BAT **r2p, BU
                BBPreclaim(r2);
                *r1p = *r2p = NULL;
                GDKerror("%s: cannot create output BATs.\n", func);
-               return GDK_FAIL;
+               return BUN_NONE;
        }
        BATseqbase(r1, 0);
        BATseqbase(r2, 0);
@@ -138,7 +169,7 @@ joininitresults(BAT **r1p, BAT **r2p, BU
        r2->tdense = 1;
        *r1p = r1;
        *r2p = r2;
-       return GDK_SUCCEED;
+       return maxsize;
 }
 
 #define VALUE(s, x)    (s##vars ? \
@@ -809,7 +840,8 @@ mergejoin_void(BAT *r1, BAT *r2, BAT *l,
  */
 static gdk_return
 mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
-         int nil_matches, int nil_on_miss, int semi, int must_match)
+         int nil_matches, int nil_on_miss, int semi, int must_match,
+         BUN maxsize)
 {
        BUN lstart, lend, lcnt;
        const oid *lcand, *lcandend;
@@ -1537,6 +1569,8 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                        BUN newcap = (BUN) ((double) total / (total - (lcand ? 
(BUN) (lcandend - lcand) : (lend - lstart))) * (BATcount(r1) + nl * nr) * 1.1);
                        if (newcap < nl * nr + BATcount(r1))
                                newcap = nl * nr + BATcount(r1) + 1024;
+                       if (newcap > maxsize)
+                               newcap = maxsize;
                        /* make sure heap.free is set properly before
                         * extending */
                        BATsetcount(r1, BATcount(r1));
@@ -1724,6 +1758,8 @@ binsearchcand(const oid *cand, BUN lo, B
        do {                                                            \
                if (BUNlast(r1) == BATcapacity(r1)) {                   \
                        newcap = BATgrows(r1);                          \
+                       if (newcap > maxsize)                           \
+                               newcap = maxsize;                       \
                        BATsetcount(r1, BATcount(r1));                  \
                        BATsetcount(r2, BATcount(r2));                  \
                        if (BATextend(r1, newcap) != GDK_SUCCEED ||     \
@@ -1752,7 +1788,7 @@ binsearchcand(const oid *cand, BUN lo, B
                    simple_EQ(v, BUNtloc(bi, hb), TYPE))
 
 static gdk_return
-hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, 
int nil_on_miss, int semi, int must_match)
+hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, 
int nil_on_miss, int semi, int must_match, BUN maxsize)
 {
        BUN lstart, lend, lcnt;
        const oid *lcand = NULL, *lcandend = NULL;
@@ -1916,6 +1952,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *
                                        r2->tkey = 0;
                                        if (BUNlast(r1) == BATcapacity(r1)) {
                                                newcap = BATgrows(r1);
+                                               if (newcap > maxsize)
+                                                       newcap = maxsize;
                                                BATsetcount(r1, BATcount(r1));
                                                BATsetcount(r2, BATcount(r2));
                                                if (BATextend(r1, newcap) != 
GDK_SUCCEED ||
@@ -2111,7 +2149,7 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *
 #define MASK_NE                (MASK_LT | MASK_GT)
 
 static gdk_return
-thetajoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int opcode)
+thetajoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int opcode, BUN 
maxsize)
 {
        BUN lstart, lend, lcnt;
        const oid *lcand = NULL, *lcandend = NULL;
@@ -2270,6 +2308,8 @@ thetajoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                continue;
                        if (BUNlast(r1) == BATcapacity(r1)) {
                                newcap = BATgrows(r1);
+                               if (newcap > maxsize)
+                                       newcap = maxsize;
                                BATsetcount(r1, BATcount(r1));
                                BATsetcount(r2, BATcount(r2));
                                if (BATextend(r1, newcap) != GDK_SUCCEED ||
@@ -2335,7 +2375,7 @@ thetajoin(BAT *r1, BAT *r2, BAT *l, BAT 
 
 static gdk_return
 bandjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
-        const void *c1, const void *c2, int li, int hi)
+        const void *c1, const void *c2, int li, int hi, BUN maxsize)
 {
        BUN lstart, lend, lcnt;
        const oid *lcand = NULL, *lcandend = NULL;
@@ -2671,6 +2711,8 @@ bandjoin(BAT *r1, BAT *r2, BAT *l, BAT *
                        }
                        if (BUNlast(r1) == BATcapacity(r1)) {
                                newcap = BATgrows(r1);
+                               if (newcap > maxsize)
+                                       newcap = maxsize;
                                BATsetcount(r1, BATcount(r1));
                                BATsetcount(r2, BATcount(r2));
                                if (BATextend(r1, newcap) != GDK_SUCCEED ||
@@ -2739,7 +2781,7 @@ static gdk_return
 subleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int 
nil_matches, BUN estimate, int nil_on_miss, int semi, int must_match, const 
char *name)
 {
        BAT *r1, *r2;
-       BUN lcount, rcount;
+       BUN lcount, rcount, maxsize;
 
        *r1p = NULL;
        *r2p = NULL;
@@ -2769,7 +2811,7 @@ subleftjoin(BAT **r1p, BAT **r2p, BAT *l
                return GDK_SUCCEED;
        }
 
-       if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ? 
BATcount(sl) : BATcount(l), name) != GDK_SUCCEED)
+       if ((maxsize = joininitresults(&r1, &r2, lcount, rcount, l->tkey, 
r->tkey, semi, nil_on_miss, estimate, name)) == BUN_NONE)
                return GDK_FAIL;
        *r1p = r1;
        *r2p = r2;
@@ -2782,9 +2824,9 @@ subleftjoin(BAT **r1p, BAT **r2p, BAT *l
             lcount < 1024 ||
             BATcount(r) * (Tsize(r) + (r->T->vheap ? r->T->vheap->size : 0) + 
2 * sizeof(BUN)) > GDK_mem_maxsize / (GDKnr_threads ? GDKnr_threads : 1)))
                return mergejoin(r1, r2, l, r, sl, sr, nil_matches,
-                                nil_on_miss, semi, must_match);
+                                nil_on_miss, semi, must_match, maxsize);
        return hashjoin(r1, r2, l, r, sl, sr, nil_matches,
-                       nil_on_miss, semi, must_match);
+                       nil_on_miss, semi, must_match, maxsize);
 }
 
 /* Perform an equi-join over l and r.  Returns two new, aligned,
@@ -2836,6 +2878,7 @@ gdk_return
 BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int 
op, int nil_matches, BUN estimate)
 {
        BAT *r1, *r2;
+       BUN maxsize;
        int opcode = 0;
 
        /* encode operator as a bit mask into opcode */
@@ -2866,14 +2909,12 @@ BATsubthetajoin(BAT **r1p, BAT **r2p, BA
        *r2p = NULL;
        if (joinparamcheck(l, r, NULL, sl, sr, "BATsubthetajoin") != 
GDK_SUCCEED)
                return GDK_FAIL;
-       if (joininitresults(&r1, &r2,
-                           estimate != BUN_NONE ? estimate : sl ? BATcount(sl) 
: BATcount(l),
-                           "BATsubthetajoin") != GDK_SUCCEED)
+       if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) : 
BATcount(l), sr ? BATcount(sr) : BATcount(r), 0, 0, 0, 0, estimate, 
"BATsubthetajoin")) == BUN_NONE)
                return GDK_FAIL;
        *r1p = r1;
        *r2p = r2;
 
-       return thetajoin(r1, r2, l, r, sl, sr, opcode);
+       return thetajoin(r1, r2, l, r, sl, sr, opcode, maxsize);
 }
 
 gdk_return
@@ -2882,6 +2923,7 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
        BAT *r1, *r2;
        BUN lcount, rcount, lpcount, rpcount;
        BUN lsize, rsize;
+       BUN maxsize;
        int lhash, rhash;
        bat lparent, rparent;
        int swap;
@@ -2913,7 +2955,7 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
                *r2p = r2;
                return GDK_SUCCEED;
        }
-       if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ? 
BATcount(sl) : BATcount(l), "BATsubjoin") != GDK_SUCCEED)
+       if ((maxsize = joininitresults(&r1, &r2, lcount, rcount, l->tkey, 
r->tkey, 0, 0, estimate, "BATsubjoin")) == BUN_NONE)
                return GDK_FAIL;
        *r1p = r1;
        *r2p = r2;
@@ -2949,9 +2991,9 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
        } else if ((l->tsorted || l->trevsorted) && (r->tsorted || 
r->trevsorted)) {
                /* both sorted, smallest on left */
                if (BATcount(l) <= BATcount(r))
-                       return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 
0, 0);
+                       return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 
0, 0, maxsize);
                else
-                       return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 
0, 0);
+                       return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 
0, 0, maxsize);
        } else if (lhash && rhash) {
                /* both have hash, smallest on right */
                swap = lcount < rcount;
@@ -2967,14 +3009,14 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
                 * "large" and the smaller of the two isn't too large
                 * (i.e. prefer hash over binary search, but only if
                 * the hash table doesn't cause thrashing) */
-               return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0);
+               return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0, 
maxsize);
        } else if ((r->tsorted || r->trevsorted) &&
                   (r->ttype == TYPE_void || lcount < 1024 || MIN(lsize, rsize) 
> mem_size)) {
                /* only right is sorted, don't swap; but only if left
                 * is "large" and the smaller of the two isn't too
                 * large (i.e. prefer hash over binary search, but
                 * only if the hash table doesn't cause thrashing) */
-               return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0);
+               return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0, 
maxsize);
        } else if ((l->batPersistence == PERSISTENT ||
                    (lparent != 0 &&
                     BBPquickdesc(abs(lparent), 0)->batPersistence == 
PERSISTENT)) &&
@@ -2999,9 +3041,9 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
                swap = 1;
        }
        if (swap) {
-               return hashjoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0);
+               return hashjoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0, 
maxsize);
        } else {
-               return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0);
+               return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0, 
maxsize);
        }
 }
 
@@ -3010,19 +3052,18 @@ BATsubbandjoin(BAT **r1p, BAT **r2p, BAT
               const void *c1, const void *c2, int li, int hi, BUN estimate)
 {
        BAT *r1, *r2;
+       BUN maxsize;
 
        *r1p = NULL;
        *r2p = NULL;
        if (joinparamcheck(l, r, NULL, sl, sr, "BATsubbandjoin") != GDK_SUCCEED)
                return GDK_FAIL;
-       if (joininitresults(&r1, &r2,
-                           estimate != BUN_NONE ? estimate : sl ? BATcount(sl) 
: BATcount(l),
-                           "BATsubbandjoin") != GDK_SUCCEED)
+       if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) : 
BATcount(l), sr ? BATcount(sr) : BATcount(r), 0, 0, 0, 0, estimate, 
"BATsubbandjoin")) == BUN_NONE)
                return GDK_FAIL;
        *r1p = r1;
        *r2p = r2;
 
-       return bandjoin(r1, r2, l, r, sl, sr, c1, c2, li, hi);
+       return bandjoin(r1, r2, l, r, sl, sr, c1, c2, li, hi, maxsize);
 }
 
 gdk_return
@@ -3030,21 +3071,20 @@ BATsubrangejoin(BAT **r1p, BAT **r2p, BA
                BAT *sl, BAT *sr, int li, int hi, BUN estimate)
 {
        BAT *r1, *r2;
+       BUN maxsize;
 
        *r1p = NULL;
        *r2p = NULL;
        if (joinparamcheck(l, rl, rh, sl, sr, "BATsubrangejoin") != GDK_SUCCEED)
                return GDK_FAIL;
-       if (joininitresults(&r1, &r2,
-                           estimate != BUN_NONE ? estimate : sl ? BATcount(sl) 
: BATcount(l),
-                           "BATsubrangejoin") != GDK_SUCCEED)
+       if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) : 
BATcount(l), sr ? BATcount(sr) : BATcount(rl), 0, 0, 0, 0, estimate, 
"BATsubrangejoin")) == BUN_NONE)
                return GDK_FAIL;
        *r1p = r1;
        *r2p = r2;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to