Changeset: 1dad1e7691a0 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1dad1e7691a0 Modified Files: gdk/gdk_join.c gdk/gdk_private.h gdk/gdk_select.c Branch: Jul2015 Log Message:
Calculate maximum join result size and use that to cap growth of BATs. This should fix bug 3791. diffs (truncated from 359 to 300 lines): diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -108,10 +108,41 @@ joinparamcheck(BAT *l, BAT *r1, BAT *r2, } /* Create the result bats for a join. */ -static gdk_return -joininitresults(BAT **r1p, BAT **r2p, BUN size, const char *func) +static BUN +joininitresults(BAT **r1p, BAT **r2p, BUN lcnt, BUN rcnt, int lkey, int rkey, int semi, int nil_on_miss, BUN estimate, const char *func) { BAT *r1, *r2; + BUN maxsize, size; + + lkey |= lcnt <= 1; + rkey |= rcnt <= 1; + + if (rkey | semi) { + /* each entry left matches at most one on right, in + * case nil_on_miss is also set, each entry matches + * exactly one */ + maxsize = lcnt; + } else if (lkey) { + /* each entry on right is matched at most once */ + if (nil_on_miss) { + /* one entry left could match all right, and + * all other entries left match nil */ + maxsize = lcnt + rcnt - 1; + } else { + maxsize = rcnt; + } + } else { + /* in the worst case we have a full cross product */ + if (lcnt == 0 || rcnt == 0) + maxsize = nil_on_miss ? lcnt : 0; + else if (BUN_MAX / lcnt >= rcnt) + maxsize = BUN_MAX; + else + maxsize = lcnt * rcnt; + } + size = estimate == BUN_NONE ? lcnt : estimate; + if (size > maxsize) + size = maxsize; r1 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT); r2 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT); @@ -120,7 +151,7 @@ joininitresults(BAT **r1p, BAT **r2p, BU BBPreclaim(r2); *r1p = *r2p = NULL; GDKerror("%s: cannot create output BATs.\n", func); - return GDK_FAIL; + return BUN_NONE; } BATseqbase(r1, 0); BATseqbase(r2, 0); @@ -138,7 +169,7 @@ joininitresults(BAT **r1p, BAT **r2p, BU r2->tdense = 1; *r1p = r1; *r2p = r2; - return GDK_SUCCEED; + return maxsize; } #define VALUE(s, x) (s##vars ? \ @@ -809,7 +840,8 @@ mergejoin_void(BAT *r1, BAT *r2, BAT *l, */ static gdk_return mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, - int nil_matches, int nil_on_miss, int semi, int must_match) + int nil_matches, int nil_on_miss, int semi, int must_match, + BUN maxsize) { BUN lstart, lend, lcnt; const oid *lcand, *lcandend; @@ -1537,6 +1569,8 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT BUN newcap = (BUN) ((double) total / (total - (lcand ? (BUN) (lcandend - lcand) : (lend - lstart))) * (BATcount(r1) + nl * nr) * 1.1); if (newcap < nl * nr + BATcount(r1)) newcap = nl * nr + BATcount(r1) + 1024; + if (newcap > maxsize) + newcap = maxsize; /* make sure heap.free is set properly before * extending */ BATsetcount(r1, BATcount(r1)); @@ -1724,6 +1758,8 @@ binsearchcand(const oid *cand, BUN lo, B do { \ if (BUNlast(r1) == BATcapacity(r1)) { \ newcap = BATgrows(r1); \ + if (newcap > maxsize) \ + newcap = maxsize; \ BATsetcount(r1, BATcount(r1)); \ BATsetcount(r2, BATcount(r2)); \ if (BATextend(r1, newcap) != GDK_SUCCEED || \ @@ -1752,7 +1788,7 @@ binsearchcand(const oid *cand, BUN lo, B simple_EQ(v, BUNtloc(bi, hb), TYPE)) static gdk_return -hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, int nil_on_miss, int semi, int must_match) +hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, int nil_on_miss, int semi, int must_match, BUN maxsize) { BUN lstart, lend, lcnt; const oid *lcand = NULL, *lcandend = NULL; @@ -1916,6 +1952,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * r2->tkey = 0; if (BUNlast(r1) == BATcapacity(r1)) { newcap = BATgrows(r1); + if (newcap > maxsize) + newcap = maxsize; BATsetcount(r1, BATcount(r1)); BATsetcount(r2, BATcount(r2)); if (BATextend(r1, newcap) != GDK_SUCCEED || @@ -2111,7 +2149,7 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * #define MASK_NE (MASK_LT | MASK_GT) static gdk_return -thetajoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int opcode) +thetajoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int opcode, BUN maxsize) { BUN lstart, lend, lcnt; const oid *lcand = NULL, *lcandend = NULL; @@ -2270,6 +2308,8 @@ thetajoin(BAT *r1, BAT *r2, BAT *l, BAT continue; if (BUNlast(r1) == BATcapacity(r1)) { newcap = BATgrows(r1); + if (newcap > maxsize) + newcap = maxsize; BATsetcount(r1, BATcount(r1)); BATsetcount(r2, BATcount(r2)); if (BATextend(r1, newcap) != GDK_SUCCEED || @@ -2335,7 +2375,7 @@ thetajoin(BAT *r1, BAT *r2, BAT *l, BAT static gdk_return bandjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, - const void *c1, const void *c2, int li, int hi) + const void *c1, const void *c2, int li, int hi, BUN maxsize) { BUN lstart, lend, lcnt; const oid *lcand = NULL, *lcandend = NULL; @@ -2671,6 +2711,8 @@ bandjoin(BAT *r1, BAT *r2, BAT *l, BAT * } if (BUNlast(r1) == BATcapacity(r1)) { newcap = BATgrows(r1); + if (newcap > maxsize) + newcap = maxsize; BATsetcount(r1, BATcount(r1)); BATsetcount(r2, BATcount(r2)); if (BATextend(r1, newcap) != GDK_SUCCEED || @@ -2739,7 +2781,7 @@ static gdk_return subleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate, int nil_on_miss, int semi, int must_match, const char *name) { BAT *r1, *r2; - BUN lcount, rcount; + BUN lcount, rcount, maxsize; *r1p = NULL; *r2p = NULL; @@ -2769,7 +2811,7 @@ subleftjoin(BAT **r1p, BAT **r2p, BAT *l return GDK_SUCCEED; } - if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ? BATcount(sl) : BATcount(l), name) != GDK_SUCCEED) + if ((maxsize = joininitresults(&r1, &r2, lcount, rcount, l->tkey, r->tkey, semi, nil_on_miss, estimate, name)) == BUN_NONE) return GDK_FAIL; *r1p = r1; *r2p = r2; @@ -2782,9 +2824,9 @@ subleftjoin(BAT **r1p, BAT **r2p, BAT *l lcount < 1024 || BATcount(r) * (Tsize(r) + (r->T->vheap ? r->T->vheap->size : 0) + 2 * sizeof(BUN)) > GDK_mem_maxsize / (GDKnr_threads ? GDKnr_threads : 1))) return mergejoin(r1, r2, l, r, sl, sr, nil_matches, - nil_on_miss, semi, must_match); + nil_on_miss, semi, must_match, maxsize); return hashjoin(r1, r2, l, r, sl, sr, nil_matches, - nil_on_miss, semi, must_match); + nil_on_miss, semi, must_match, maxsize); } /* Perform an equi-join over l and r. Returns two new, aligned, @@ -2836,6 +2878,7 @@ gdk_return BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int op, int nil_matches, BUN estimate) { BAT *r1, *r2; + BUN maxsize; int opcode = 0; /* encode operator as a bit mask into opcode */ @@ -2866,14 +2909,12 @@ BATsubthetajoin(BAT **r1p, BAT **r2p, BA *r2p = NULL; if (joinparamcheck(l, r, NULL, sl, sr, "BATsubthetajoin") != GDK_SUCCEED) return GDK_FAIL; - if (joininitresults(&r1, &r2, - estimate != BUN_NONE ? estimate : sl ? BATcount(sl) : BATcount(l), - "BATsubthetajoin") != GDK_SUCCEED) + if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) : BATcount(l), sr ? BATcount(sr) : BATcount(r), 0, 0, 0, 0, estimate, "BATsubthetajoin")) == BUN_NONE) return GDK_FAIL; *r1p = r1; *r2p = r2; - return thetajoin(r1, r2, l, r, sl, sr, opcode); + return thetajoin(r1, r2, l, r, sl, sr, opcode, maxsize); } gdk_return @@ -2882,6 +2923,7 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r1, *r2; BUN lcount, rcount, lpcount, rpcount; BUN lsize, rsize; + BUN maxsize; int lhash, rhash; bat lparent, rparent; int swap; @@ -2913,7 +2955,7 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, *r2p = r2; return GDK_SUCCEED; } - if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ? BATcount(sl) : BATcount(l), "BATsubjoin") != GDK_SUCCEED) + if ((maxsize = joininitresults(&r1, &r2, lcount, rcount, l->tkey, r->tkey, 0, 0, estimate, "BATsubjoin")) == BUN_NONE) return GDK_FAIL; *r1p = r1; *r2p = r2; @@ -2949,9 +2991,9 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, } else if ((l->tsorted || l->trevsorted) && (r->tsorted || r->trevsorted)) { /* both sorted, smallest on left */ if (BATcount(l) <= BATcount(r)) - return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0); + return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0, maxsize); else - return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0); + return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0, maxsize); } else if (lhash && rhash) { /* both have hash, smallest on right */ swap = lcount < rcount; @@ -2967,14 +3009,14 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, * "large" and the smaller of the two isn't too large * (i.e. prefer hash over binary search, but only if * the hash table doesn't cause thrashing) */ - return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0); + return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0, maxsize); } else if ((r->tsorted || r->trevsorted) && (r->ttype == TYPE_void || lcount < 1024 || MIN(lsize, rsize) > mem_size)) { /* only right is sorted, don't swap; but only if left * is "large" and the smaller of the two isn't too * large (i.e. prefer hash over binary search, but * only if the hash table doesn't cause thrashing) */ - return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0); + return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0, maxsize); } else if ((l->batPersistence == PERSISTENT || (lparent != 0 && BBPquickdesc(abs(lparent), 0)->batPersistence == PERSISTENT)) && @@ -2999,9 +3041,9 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, swap = 1; } if (swap) { - return hashjoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0); + return hashjoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0, maxsize); } else { - return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0); + return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0, maxsize); } } @@ -3010,19 +3052,18 @@ BATsubbandjoin(BAT **r1p, BAT **r2p, BAT const void *c1, const void *c2, int li, int hi, BUN estimate) { BAT *r1, *r2; + BUN maxsize; *r1p = NULL; *r2p = NULL; if (joinparamcheck(l, r, NULL, sl, sr, "BATsubbandjoin") != GDK_SUCCEED) return GDK_FAIL; - if (joininitresults(&r1, &r2, - estimate != BUN_NONE ? estimate : sl ? BATcount(sl) : BATcount(l), - "BATsubbandjoin") != GDK_SUCCEED) + if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) : BATcount(l), sr ? BATcount(sr) : BATcount(r), 0, 0, 0, 0, estimate, "BATsubbandjoin")) == BUN_NONE) return GDK_FAIL; *r1p = r1; *r2p = r2; - return bandjoin(r1, r2, l, r, sl, sr, c1, c2, li, hi); + return bandjoin(r1, r2, l, r, sl, sr, c1, c2, li, hi, maxsize); } gdk_return @@ -3030,21 +3071,20 @@ BATsubrangejoin(BAT **r1p, BAT **r2p, BA BAT *sl, BAT *sr, int li, int hi, BUN estimate) { BAT *r1, *r2; + BUN maxsize; *r1p = NULL; *r2p = NULL; if (joinparamcheck(l, rl, rh, sl, sr, "BATsubrangejoin") != GDK_SUCCEED) return GDK_FAIL; - if (joininitresults(&r1, &r2, - estimate != BUN_NONE ? estimate : sl ? BATcount(sl) : BATcount(l), - "BATsubrangejoin") != GDK_SUCCEED) + if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) : BATcount(l), sr ? BATcount(sr) : BATcount(rl), 0, 0, 0, 0, estimate, "BATsubrangejoin")) == BUN_NONE) return GDK_FAIL; *r1p = r1; *r2p = r2; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list