Changeset: bb94557d58a1 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bb94557d58a1 Added Files: sql/test/BugTracker-2013/Tests/corrupt-after-restart.Bug-3282.py sql/test/BugTracker-2013/Tests/corrupt-after-restart.Bug-3282.stable.err sql/test/BugTracker-2013/Tests/corrupt-after-restart.Bug-3282.stable.out Modified Files: clients/Tests/exports.stable.out gdk/gdk.h gdk/gdk_batop.c gdk/gdk_join.c gdk/gdk_relop.mx gdk/gdk_select.c gdk/gdk_storage.c monetdb5/extras/jaql/Tests/join00.stable.out monetdb5/extras/jaql/Tests/join01.stable.out monetdb5/extras/jaql/Tests/join02.stable.out monetdb5/extras/jaql/jaqltests/Tests/join.stable.out monetdb5/mal/Tests/tst034.stable.out monetdb5/modules/kernel/aggr.c monetdb5/modules/kernel/algebra.mx monetdb5/modules/mal/Tests/All monetdb5/modules/mal/mat.c sql/test/BugTracker-2013/Tests/All sql/test/leaks/Tests/check0.stable.out sql/test/leaks/Tests/check1.stable.out sql/test/leaks/Tests/check2.stable.out sql/test/leaks/Tests/check3.stable.out sql/test/leaks/Tests/check4.stable.out sql/test/leaks/Tests/check5.stable.out sql/test/leaks/Tests/drop3.stable.out sql/test/leaks/Tests/select1.stable.out sql/test/leaks/Tests/select2.stable.out sql/test/leaks/Tests/temp1.stable.out sql/test/leaks/Tests/temp2.stable.out sql/test/leaks/Tests/temp3.stable.out Branch: rdf Log Message:
Merge with default branch diffs (truncated from 4696 to 300 lines): diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -115,8 +115,6 @@ void BATderiveHeadProps(BAT *b, int expe void BATderiveProps(BAT *b, int expensive); BAT *BATextend(BAT *b, BUN newcap); BAT *BATfakeCommit(BAT *b); -BAT *BATfetch(BAT *b, BAT *s); -BAT *BATfetchjoin(BAT *b, BAT *s, BUN estimate); int BATgetaccess(BAT *b); PROPrec *BATgetprop(BAT *b, int idx); gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT *b, BAT *g, BAT *e, BAT *h); @@ -135,7 +133,6 @@ BAT *BATgroupvariance_population(BAT *b, BAT *BATgroupvariance_sample(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int skip_nils, int abort_on_error); BUN BATgrows(BAT *b); BAT *BAThash(BAT *b, BUN masksize); -BAT *BAThashjoin(BAT *l, BAT *r, BUN estimate); BAT *BAThistogram(BAT *b); BAT *BATimprints(BAT *b); BAT *BATins(BAT *b, BAT *c, bit force); @@ -155,7 +152,6 @@ BAT *BATmaterialize(BAT *b); BAT *BATmaterializeh(BAT *b); size_t BATmemsize(BAT *b, int dirty); BAT *BATmergecand(BAT *a, BAT *b); -BAT *BATmergejoin(BAT *l, BAT *r, BUN estimate); int BATmmap(BAT *b, int hb, int tb, int hh, int th, int force); BAT *BATmode(BAT *b, int onoff); int BATmultiprintf(stream *f, int argc, BAT *argv[], int printoid, int order, int printorderby); @@ -196,6 +192,7 @@ BAT *BATsort_rev(BAT *b); BAT *BATssort(BAT *b); BAT *BATssort_rev(BAT *b); gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN estimate); +gdk_return BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN estimate); gdk_return BATsubleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN estimate); gdk_return BATsubouterjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN estimate); BAT *BATsubselect(BAT *b, BAT *s, const void *tl, const void *th, int li, int hi, int anti); @@ -747,15 +744,11 @@ str ALGcrossproduct2(int *l, int *r, int str ALGexist(bit *ret, int *bid, ptr val); str ALGexistBUN(bit *ret, int *bid, ptr val, ptr tval); str ALGfetch(ptr ret, int *bid, lng *pos); -str ALGfetchbat(int *ret, int *bid, int *sid); str ALGfetchint(int *ret, int *bid, int *pos); -str ALGfetchjoin(int *result, int *lid, int *rid); -str ALGfetchjoinestimate(int *result, int *lid, int *rid, lng *estimate); str ALGfetchoid(int *ret, int *bid, oid *pos); str ALGfind(ptr ret, int *bid, ptr val); str ALGfragment(int *result, int *bid, ptr hlow, ptr hhigh, ptr tlow, ptr thigh); str ALGgroupby(int *res, int *gids, int *cnts); -str ALGhashjoin(int *result, int *lid, int *rid); str ALGhistogram(int *result, int *bid); str ALGhistogram_rev(int *result, int *bid); str ALGhmarkp(int *result, int *bid, int *nr_parts, int *part_nr); @@ -791,7 +784,6 @@ str ALGmax_sht(sht *res, int *bid); str ALGmax_wrd(wrd *res, int *bid); str ALGmaxany(ptr result, int *bid); str ALGmerge(int *result, int *bid); -str ALGmergejoin(int *result, int *lid, int *rid); str ALGmin_bte(bte *res, int *bid); str ALGmin_dbl(dbl *res, int *bid); str ALGmin_flt(flt *res, int *bid); diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2151,7 +2151,6 @@ gdk_export oid OIDnew(oid inc); * operations. */ gdk_export BAT *BAThash(BAT *b, BUN masksize); -gdk_export BAT *BAThashjoin(BAT *l, BAT *r, BUN estimate); /* low level functions */ @@ -3147,7 +3146,6 @@ gdk_export BAT *BATconstant(int tt, cons gdk_export BAT *BATconst(BAT *l, int tt, const void *val); gdk_export BAT *BATthetajoin(BAT *l, BAT *r, int mode, BUN estimate); gdk_export BAT *BATsemijoin(BAT *l, BAT *r); -gdk_export BAT *BATmergejoin(BAT *l, BAT *r, BUN estimate); gdk_export BAT *BATjoin(BAT *l, BAT *r, BUN estimate); gdk_export BAT *BATantijoin(BAT *l, BAT *r); gdk_export BAT *BATleftjoin(BAT *l, BAT *r, BUN estimate); @@ -3159,11 +3157,10 @@ gdk_export gdk_return BATsubouterjoin(BA gdk_export gdk_return BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *op, BUN estimate); gdk_export gdk_return BATsubsemijoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN estimate); gdk_export gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN estimate); +gdk_export gdk_return BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN estimate); gdk_export BAT *BATproject(BAT *l, BAT *r); gdk_export BAT *BATslice(BAT *b, BUN low, BUN high); -gdk_export BAT *BATfetch(BAT *b, BAT *s); -gdk_export BAT *BATfetchjoin(BAT *b, BAT *s, BUN estimate); gdk_export BAT *BATleftfetchjoin(BAT *b, BAT *s, BUN estimate); gdk_export BAT *BATsunique(BAT *b); diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -1319,7 +1319,7 @@ BATsubsort(BAT **sorted, BAT **order, BA return GDK_SUCCEED; } if (o) { - bn = BATleftfetchjoin(o, b, BATcount(b)); + bn = BATproject(o, b); if (bn == NULL) goto error; if (bn->ttype == TYPE_void || isVIEW(bn)) { diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -180,7 +180,7 @@ binsearch(const oid *rcand, oid offset, */ static gdk_return mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, - int nil_matches, int nil_on_miss, int semi) + int nil_matches, int nil_on_miss, int semi, int must_match) { BUN lstart, lend, lcnt; const oid *lcand, *lcandend; @@ -213,7 +213,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT ALGODEBUG fprintf(stderr, "#mergejoin(l=%s#" BUNFMT "[%s]%s%s," "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s," "sr=%s#" BUNFMT "%s%s,nil_matches=%d," - "nil_on_miss=%d,semi=%d)\n", + "nil_on_miss=%d,semi=%d,must_match=%d)\n", BATgetId(l), BATcount(l), ATOMname(l->ttype), l->tsorted ? "-sorted" : "", l->trevsorted ? "-revsorted" : "", @@ -226,7 +226,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0, sr && sr->tsorted ? "-sorted" : "", sr && sr->trevsorted ? "-revsorted" : "", - nil_matches, nil_on_miss, semi); + nil_matches, nil_on_miss, semi, must_match); assert(BAThdense(l)); assert(BAThdense(r)); @@ -234,6 +234,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT assert(r->tsorted || r->trevsorted); assert(sl == NULL || sl->tsorted); assert(sr == NULL || sr->tsorted); + assert(!nil_on_miss || !must_match); /* can't have both */ CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend); CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend); @@ -255,6 +256,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT if (lstart == lend || (!nil_on_miss && rstart == rend)) { /* nothing to do: there are no matches */ + if (must_match && lstart < lend) { + GDKerror("mergejoin(%s,%s) does not hit always => can't use fetchjoin.\n", BATgetId(l), BATgetId(r)); + goto bailout; + } + ALGODEBUG fprintf(stderr, "#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n", + BATgetId(l), BATgetId(r), + BATgetId(r1), BATcount(r1), + r1->tsorted ? "-sorted" : "", + r1->trevsorted ? "-revsorted" : "", + BATgetId(r2), BATcount(r2), + r2->tsorted ? "-sorted" : "", + r2->trevsorted ? "-revsorted" : ""); return GDK_SUCCEED; } if (!nil_on_miss) { @@ -266,6 +279,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT /* nothing to do: all values in either l or r * (or both) are nil, and we don't have to * match nil values */ + if (must_match) { + GDKerror("mergejoin(%s,%s) does not hit always => can't use fetchjoin.\n", BATgetId(l), BATgetId(r)); + goto bailout; + } + ALGODEBUG fprintf(stderr, "#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n", + BATgetId(l), BATgetId(r), + BATgetId(r1), BATcount(r1), + r1->tsorted ? "-sorted" : "", + r1->trevsorted ? "-revsorted" : "", + BATgetId(r2), BATcount(r2), + r2->tsorted ? "-sorted" : "", + r2->trevsorted ? "-revsorted" : ""); return GDK_SUCCEED; } if ((l->ttype == TYPE_void && l->tseqbase == oid_nil && @@ -277,6 +302,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT /* nothing to do: all values in l are nil, and * we can guarantee there are no nil values in * r, or vice versa */ + if (must_match) { + GDKerror("mergejoin(%s,%s) does not hit always => can't use fetchjoin.\n", BATgetId(l), BATgetId(r)); + goto bailout; + } + ALGODEBUG fprintf(stderr, "#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n", + BATgetId(l), BATgetId(r), + BATgetId(r1), BATcount(r1), + r1->tsorted ? "-sorted" : "", + r1->trevsorted ? "-revsorted" : "", + BATgetId(r2), BATcount(r2), + r2->tsorted ? "-sorted" : "", + r2->trevsorted ? "-revsorted" : ""); return GDK_SUCCEED; } } @@ -288,9 +325,10 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT nl > 0; lscan++) nl >>= 1; - equal_order = l->tsorted == r->tsorted || - l->trevsorted == r->trevsorted; - lordering = l->tsorted && (r->tsorted|| !equal_order) ? 1 : -1; + equal_order = (l->tsorted && r->tsorted) || + (l->trevsorted && r->trevsorted && + l->ttype != TYPE_void && r->ttype != TYPE_void); + lordering = l->tsorted && (r->tsorted || !equal_order) ? 1 : -1; rordering = equal_order ? lordering : -lordering; } else { /* if l not sorted, we will always use binary search @@ -344,16 +382,23 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT rcandorig = rcand; rstartorig = rstart; while (lcand ? lcand < lcandend : lstart < lend) { - if (!nil_on_miss && lscan > 0) { - /* if next value in r too far away (more than - * lscan from current position in l), use - * binary search on l to skip over - * non-matching values, but only if l is - * sorted (lscan > 0) and we don't have to - * insert nils (outer join) - * - * next value to match in r is first if - * equal_order is set, last otherwise */ + if (!nil_on_miss && !must_match && lscan > 0) { + /* If l is sorted (lscan > 0), we look at the + * next value in r to see whether we can jump + * over a large section of l using binary + * search. We do this by looking ahead in l + * (lscan far, to be precise) and seeing if + * the value there is still too "small" + * (definition depends on sort order of l). + * If it is, we use binary search on l, + * otherwise we scan l for the next position + * with a value greater than or equal to the + * value in r. Obviously, we can only do this + * if we're not inserting NILs for missing + * values in r (nil_on_miss set, i.e., outer + * join). + * The next value to in r is the first if + * equal_order is set, the last otherwise. */ if (rcand) { v = VALUE(r, (equal_order ? rcand[0] : rcandend[-1]) - r->hseqbase); } else if (rvals) { @@ -374,6 +419,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT rval = rend - 1 + r->tseqbase; v = (const char *) &rval; } + /* here, v points to next value in r */ if (lcand) { if (lscan < (BUN) (lcandend - lcand) && lordering * cmp(VALUE(l, lcand[lscan] - l->hseqbase), @@ -426,15 +472,33 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT rcand = rcandorig; rstart = rstartorig; } - /* v is the value we're going to work with in this - * iteration; count number of equal values in left */ + /* Here we determine the next value in l that we are + * going to try to match in r. We will also count the + * number of occurrences in l of that value. + * Afterwards, v points to the value and nl is the + * number of times it occurs. Also, lstart/lcand will + * point to the next value to be considered (ready for + * the next iteration). + * If there are many equal values in l (more than + * lscan), we will use binary search to find the end + * of the sequence. Obviously, we can do this only if + * l is actually sorted (lscan > 0). */ nl = 1; /* we'll match (at least) one in l */ nr = 0; /* maybe we won't match anything in r */ if (lcand) { v = VALUE(l, lcand[0] - l->hseqbase); - while (++lcand < lcandend && - cmp(v, VALUE(l, lcand[0] - l->hseqbase)) == 0) - nl++; + if (lscan > 0 && + lscan < (BUN) (lcandend - lcand) && + cmp(v, VALUE(l, lcand[lscan] - l->hseqbase)) == 0) { + /* lots of equal values: use binary + * search to find end */ + nl = binsearch(lcand, l->hseqbase, lvals, lvars, lwidth, lscan, lcandend - lcand, v, cmp, lordering, 1); + lcand += nl; + } else { + while (++lcand < lcandend && + cmp(v, VALUE(l, lcand[0] - l->hseqbase)) == 0) + nl++; + } } else if (lvals) { v = VALUE(l, lstart); if (loff == wrd_nil) { @@ -445,9 +509,23 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT v = (const char *) &lval; } else { /* compare values without offset */ - while (++lstart < lend && - cmp(v, VALUE(l, lstart)) == 0) _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list