Changeset: bb94557d58a1 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bb94557d58a1
Added Files:
        sql/test/BugTracker-2013/Tests/corrupt-after-restart.Bug-3282.py
        sql/test/BugTracker-2013/Tests/corrupt-after-restart.Bug-3282.stable.err
        sql/test/BugTracker-2013/Tests/corrupt-after-restart.Bug-3282.stable.out
Modified Files:
        clients/Tests/exports.stable.out
        gdk/gdk.h
        gdk/gdk_batop.c
        gdk/gdk_join.c
        gdk/gdk_relop.mx
        gdk/gdk_select.c
        gdk/gdk_storage.c
        monetdb5/extras/jaql/Tests/join00.stable.out
        monetdb5/extras/jaql/Tests/join01.stable.out
        monetdb5/extras/jaql/Tests/join02.stable.out
        monetdb5/extras/jaql/jaqltests/Tests/join.stable.out
        monetdb5/mal/Tests/tst034.stable.out
        monetdb5/modules/kernel/aggr.c
        monetdb5/modules/kernel/algebra.mx
        monetdb5/modules/mal/Tests/All
        monetdb5/modules/mal/mat.c
        sql/test/BugTracker-2013/Tests/All
        sql/test/leaks/Tests/check0.stable.out
        sql/test/leaks/Tests/check1.stable.out
        sql/test/leaks/Tests/check2.stable.out
        sql/test/leaks/Tests/check3.stable.out
        sql/test/leaks/Tests/check4.stable.out
        sql/test/leaks/Tests/check5.stable.out
        sql/test/leaks/Tests/drop3.stable.out
        sql/test/leaks/Tests/select1.stable.out
        sql/test/leaks/Tests/select2.stable.out
        sql/test/leaks/Tests/temp1.stable.out
        sql/test/leaks/Tests/temp2.stable.out
        sql/test/leaks/Tests/temp3.stable.out
Branch: rdf
Log Message:

Merge with default branch


diffs (truncated from 4696 to 300 lines):

diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -115,8 +115,6 @@ void BATderiveHeadProps(BAT *b, int expe
 void BATderiveProps(BAT *b, int expensive);
 BAT *BATextend(BAT *b, BUN newcap);
 BAT *BATfakeCommit(BAT *b);
-BAT *BATfetch(BAT *b, BAT *s);
-BAT *BATfetchjoin(BAT *b, BAT *s, BUN estimate);
 int BATgetaccess(BAT *b);
 PROPrec *BATgetprop(BAT *b, int idx);
 gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT *b, BAT *g, 
BAT *e, BAT *h);
@@ -135,7 +133,6 @@ BAT *BATgroupvariance_population(BAT *b,
 BAT *BATgroupvariance_sample(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int 
skip_nils, int abort_on_error);
 BUN BATgrows(BAT *b);
 BAT *BAThash(BAT *b, BUN masksize);
-BAT *BAThashjoin(BAT *l, BAT *r, BUN estimate);
 BAT *BAThistogram(BAT *b);
 BAT *BATimprints(BAT *b);
 BAT *BATins(BAT *b, BAT *c, bit force);
@@ -155,7 +152,6 @@ BAT *BATmaterialize(BAT *b);
 BAT *BATmaterializeh(BAT *b);
 size_t BATmemsize(BAT *b, int dirty);
 BAT *BATmergecand(BAT *a, BAT *b);
-BAT *BATmergejoin(BAT *l, BAT *r, BUN estimate);
 int BATmmap(BAT *b, int hb, int tb, int hh, int th, int force);
 BAT *BATmode(BAT *b, int onoff);
 int BATmultiprintf(stream *f, int argc, BAT *argv[], int printoid, int order, 
int printorderby);
@@ -196,6 +192,7 @@ BAT *BATsort_rev(BAT *b);
 BAT *BATssort(BAT *b);
 BAT *BATssort_rev(BAT *b);
 gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, 
BUN estimate);
+gdk_return BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, 
BAT *sr, BUN estimate);
 gdk_return BATsubleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT 
*sr, BUN estimate);
 gdk_return BATsubouterjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT 
*sr, BUN estimate);
 BAT *BATsubselect(BAT *b, BAT *s, const void *tl, const void *th, int li, int 
hi, int anti);
@@ -747,15 +744,11 @@ str ALGcrossproduct2(int *l, int *r, int
 str ALGexist(bit *ret, int *bid, ptr val);
 str ALGexistBUN(bit *ret, int *bid, ptr val, ptr tval);
 str ALGfetch(ptr ret, int *bid, lng *pos);
-str ALGfetchbat(int *ret, int *bid, int *sid);
 str ALGfetchint(int *ret, int *bid, int *pos);
-str ALGfetchjoin(int *result, int *lid, int *rid);
-str ALGfetchjoinestimate(int *result, int *lid, int *rid, lng *estimate);
 str ALGfetchoid(int *ret, int *bid, oid *pos);
 str ALGfind(ptr ret, int *bid, ptr val);
 str ALGfragment(int *result, int *bid, ptr hlow, ptr hhigh, ptr tlow, ptr 
thigh);
 str ALGgroupby(int *res, int *gids, int *cnts);
-str ALGhashjoin(int *result, int *lid, int *rid);
 str ALGhistogram(int *result, int *bid);
 str ALGhistogram_rev(int *result, int *bid);
 str ALGhmarkp(int *result, int *bid, int *nr_parts, int *part_nr);
@@ -791,7 +784,6 @@ str ALGmax_sht(sht *res, int *bid);
 str ALGmax_wrd(wrd *res, int *bid);
 str ALGmaxany(ptr result, int *bid);
 str ALGmerge(int *result, int *bid);
-str ALGmergejoin(int *result, int *lid, int *rid);
 str ALGmin_bte(bte *res, int *bid);
 str ALGmin_dbl(dbl *res, int *bid);
 str ALGmin_flt(flt *res, int *bid);
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2151,7 +2151,6 @@ gdk_export oid OIDnew(oid inc);
  * operations.
  */
 gdk_export BAT *BAThash(BAT *b, BUN masksize);
-gdk_export BAT *BAThashjoin(BAT *l, BAT *r, BUN estimate);
 
 /* low level functions */
 
@@ -3147,7 +3146,6 @@ gdk_export BAT *BATconstant(int tt, cons
 gdk_export BAT *BATconst(BAT *l, int tt, const void *val);
 gdk_export BAT *BATthetajoin(BAT *l, BAT *r, int mode, BUN estimate);
 gdk_export BAT *BATsemijoin(BAT *l, BAT *r);
-gdk_export BAT *BATmergejoin(BAT *l, BAT *r, BUN estimate);
 gdk_export BAT *BATjoin(BAT *l, BAT *r, BUN estimate);
 gdk_export BAT *BATantijoin(BAT *l, BAT *r);
 gdk_export BAT *BATleftjoin(BAT *l, BAT *r, BUN estimate);
@@ -3159,11 +3157,10 @@ gdk_export gdk_return BATsubouterjoin(BA
 gdk_export gdk_return BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, 
BAT *sl, BAT *sr, const char *op, BUN estimate);
 gdk_export gdk_return BATsubsemijoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT 
*sl, BAT *sr, BUN estimate);
 gdk_export gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT 
*sl, BAT *sr, BUN estimate);
+gdk_export gdk_return BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT 
*r, BAT *sl, BAT *sr, BUN estimate);
 gdk_export BAT *BATproject(BAT *l, BAT *r);
 
 gdk_export BAT *BATslice(BAT *b, BUN low, BUN high);
-gdk_export BAT *BATfetch(BAT *b, BAT *s);
-gdk_export BAT *BATfetchjoin(BAT *b, BAT *s, BUN estimate);
 gdk_export BAT *BATleftfetchjoin(BAT *b, BAT *s, BUN estimate);
 
 gdk_export BAT *BATsunique(BAT *b);
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -1319,7 +1319,7 @@ BATsubsort(BAT **sorted, BAT **order, BA
                return GDK_SUCCEED;
        }
        if (o) {
-               bn = BATleftfetchjoin(o, b, BATcount(b));
+               bn = BATproject(o, b);
                if (bn == NULL)
                        goto error;
                if (bn->ttype == TYPE_void || isVIEW(bn)) {
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -180,7 +180,7 @@ binsearch(const oid *rcand, oid offset,
  */
 static gdk_return
 mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
-         int nil_matches, int nil_on_miss, int semi)
+         int nil_matches, int nil_on_miss, int semi, int must_match)
 {
        BUN lstart, lend, lcnt;
        const oid *lcand, *lcandend;
@@ -213,7 +213,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
        ALGODEBUG fprintf(stderr, "#mergejoin(l=%s#" BUNFMT "[%s]%s%s,"
                          "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
                          "sr=%s#" BUNFMT "%s%s,nil_matches=%d,"
-                         "nil_on_miss=%d,semi=%d)\n",
+                         "nil_on_miss=%d,semi=%d,must_match=%d)\n",
                          BATgetId(l), BATcount(l), ATOMname(l->ttype),
                          l->tsorted ? "-sorted" : "",
                          l->trevsorted ? "-revsorted" : "",
@@ -226,7 +226,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                          sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
                          sr && sr->tsorted ? "-sorted" : "",
                          sr && sr->trevsorted ? "-revsorted" : "",
-                         nil_matches, nil_on_miss, semi);
+                         nil_matches, nil_on_miss, semi, must_match);
 
        assert(BAThdense(l));
        assert(BAThdense(r));
@@ -234,6 +234,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
        assert(r->tsorted || r->trevsorted);
        assert(sl == NULL || sl->tsorted);
        assert(sr == NULL || sr->tsorted);
+       assert(!nil_on_miss || !must_match); /* can't have both */
 
        CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
        CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
@@ -255,6 +256,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
 
        if (lstart == lend || (!nil_on_miss && rstart == rend)) {
                /* nothing to do: there are no matches */
+               if (must_match && lstart < lend) {
+                       GDKerror("mergejoin(%s,%s) does not hit always => can't 
use fetchjoin.\n", BATgetId(l), BATgetId(r));
+                       goto bailout;
+               }
+               ALGODEBUG fprintf(stderr, 
"#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
+                         BATgetId(l), BATgetId(r),
+                         BATgetId(r1), BATcount(r1),
+                         r1->tsorted ? "-sorted" : "",
+                         r1->trevsorted ? "-revsorted" : "",
+                         BATgetId(r2), BATcount(r2),
+                         r2->tsorted ? "-sorted" : "",
+                         r2->trevsorted ? "-revsorted" : "");
                return GDK_SUCCEED;
        }
        if (!nil_on_miss) {
@@ -266,6 +279,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                        /* nothing to do: all values in either l or r
                         * (or both) are nil, and we don't have to
                         * match nil values */
+                       if (must_match) {
+                               GDKerror("mergejoin(%s,%s) does not hit always 
=> can't use fetchjoin.\n", BATgetId(l), BATgetId(r));
+                               goto bailout;
+                       }
+                       ALGODEBUG fprintf(stderr, 
"#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
+                                 BATgetId(l), BATgetId(r),
+                                 BATgetId(r1), BATcount(r1),
+                                 r1->tsorted ? "-sorted" : "",
+                                 r1->trevsorted ? "-revsorted" : "",
+                                 BATgetId(r2), BATcount(r2),
+                                 r2->tsorted ? "-sorted" : "",
+                                 r2->trevsorted ? "-revsorted" : "");
                        return GDK_SUCCEED;
                }
                if ((l->ttype == TYPE_void && l->tseqbase == oid_nil &&
@@ -277,6 +302,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                        /* nothing to do: all values in l are nil, and
                         * we can guarantee there are no nil values in
                         * r, or vice versa */
+                       if (must_match) {
+                               GDKerror("mergejoin(%s,%s) does not hit always 
=> can't use fetchjoin.\n", BATgetId(l), BATgetId(r));
+                               goto bailout;
+                       }
+                       ALGODEBUG fprintf(stderr, 
"#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
+                                 BATgetId(l), BATgetId(r),
+                                 BATgetId(r1), BATcount(r1),
+                                 r1->tsorted ? "-sorted" : "",
+                                 r1->trevsorted ? "-revsorted" : "",
+                                 BATgetId(r2), BATcount(r2),
+                                 r2->tsorted ? "-sorted" : "",
+                                 r2->trevsorted ? "-revsorted" : "");
                        return GDK_SUCCEED;
                }
        }
@@ -288,9 +325,10 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                     nl > 0;
                     lscan++)
                        nl >>= 1;
-               equal_order = l->tsorted == r->tsorted ||
-                       l->trevsorted == r->trevsorted;
-               lordering = l->tsorted && (r->tsorted|| !equal_order) ? 1 : -1;
+               equal_order = (l->tsorted && r->tsorted) ||
+                       (l->trevsorted && r->trevsorted &&
+                        l->ttype != TYPE_void && r->ttype != TYPE_void);
+               lordering = l->tsorted && (r->tsorted || !equal_order) ? 1 : -1;
                rordering = equal_order ? lordering : -lordering;
        } else {
                /* if l not sorted, we will always use binary search
@@ -344,16 +382,23 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
        rcandorig = rcand;
        rstartorig = rstart;
        while (lcand ? lcand < lcandend : lstart < lend) {
-               if (!nil_on_miss && lscan > 0) {
-                       /* if next value in r too far away (more than
-                        * lscan from current position in l), use
-                        * binary search on l to skip over
-                        * non-matching values, but only if l is
-                        * sorted (lscan > 0) and we don't have to
-                        * insert nils (outer join)
-                        *
-                        * next value to match in r is first if
-                        * equal_order is set, last otherwise */
+               if (!nil_on_miss && !must_match && lscan > 0) {
+                       /* If l is sorted (lscan > 0), we look at the
+                        * next value in r to see whether we can jump
+                        * over a large section of l using binary
+                        * search.  We do this by looking ahead in l
+                        * (lscan far, to be precise) and seeing if
+                        * the value there is still too "small"
+                        * (definition depends on sort order of l).
+                        * If it is, we use binary search on l,
+                        * otherwise we scan l for the next position
+                        * with a value greater than or equal to the
+                        * value in r.  Obviously, we can only do this
+                        * if we're not inserting NILs for missing
+                        * values in r (nil_on_miss set, i.e., outer
+                        * join).
+                        * The next value to in r is the first if
+                        * equal_order is set, the last otherwise. */
                        if (rcand) {
                                v = VALUE(r, (equal_order ? rcand[0] : 
rcandend[-1]) - r->hseqbase);
                        } else if (rvals) {
@@ -374,6 +419,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                        rval = rend - 1 + r->tseqbase;
                                v = (const char *) &rval;
                        }
+                       /* here, v points to next value in r */
                        if (lcand) {
                                if (lscan < (BUN) (lcandend - lcand) &&
                                    lordering * cmp(VALUE(l, lcand[lscan] - 
l->hseqbase),
@@ -426,15 +472,33 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                        rcand = rcandorig;
                        rstart = rstartorig;
                }
-               /* v is the value we're going to work with in this
-                * iteration; count number of equal values in left */
+               /* Here we determine the next value in l that we are
+                * going to try to match in r.  We will also count the
+                * number of occurrences in l of that value.
+                * Afterwards, v points to the value and nl is the
+                * number of times it occurs.  Also, lstart/lcand will
+                * point to the next value to be considered (ready for
+                * the next iteration).
+                * If there are many equal values in l (more than
+                * lscan), we will use binary search to find the end
+                * of the sequence.  Obviously, we can do this only if
+                * l is actually sorted (lscan > 0). */
                nl = 1;         /* we'll match (at least) one in l */
                nr = 0;         /* maybe we won't match anything in r */
                if (lcand) {
                        v = VALUE(l, lcand[0] - l->hseqbase);
-                       while (++lcand < lcandend &&
-                              cmp(v, VALUE(l, lcand[0] - l->hseqbase)) == 0)
-                               nl++;
+                       if (lscan > 0 &&
+                           lscan < (BUN) (lcandend - lcand) &&
+                           cmp(v, VALUE(l, lcand[lscan] - l->hseqbase)) == 0) {
+                               /* lots of equal values: use binary
+                                * search to find end */
+                               nl = binsearch(lcand, l->hseqbase, lvals, 
lvars, lwidth, lscan, lcandend - lcand, v, cmp, lordering, 1);
+                               lcand += nl;
+                       } else {
+                               while (++lcand < lcandend &&
+                                      cmp(v, VALUE(l, lcand[0] - l->hseqbase)) 
== 0)
+                                       nl++;
+                       }
                } else if (lvals) {
                        v = VALUE(l, lstart);
                        if (loff == wrd_nil) {
@@ -445,9 +509,23 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                v = (const char *) &lval;
                        } else {
                                /* compare values without offset */
-                               while (++lstart < lend &&
-                                      cmp(v, VALUE(l, lstart)) == 0)
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to