Changeset: a3b13a3f92d4 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a3b13a3f92d4 Modified Files: gdk/gdk_join.c gdk/gdk_private.h gdk/gdk_search.c Branch: Jul2017 Log Message:
Merge with Dec2016 branch. diffs (truncated from 1081 to 300 lines): diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -205,8 +205,8 @@ joininitresults(BAT **r1p, BAT **r2p, BU #define VALUE(s, x) (s##vars ? \ s##vars + VarHeapVal(s##vals, (x), s##width) : \ - s##vals + ((x) * s##width)) -#define FVALUE(s, x) (s##vals + ((x) * s##width)) + (const char *) s##vals + ((x) * s##width)) +#define FVALUE(s, x) ((const char *) s##vals + ((x) * s##width)) #define APPEND(b, o) (((oid *) b->theap.base)[b->batCount++] = (o)) @@ -762,21 +762,6 @@ mergejoin_void(BAT *r1, BAT *r2, BAT *l, return GDK_FAIL; } -/* Perform a "merge" join on l and r (if both are sorted) with - * optional candidate lists, or join using binary search on r if l is - * not sorted. The return BATs have already been created by the - * caller. - * - * If nil_matches is set, nil values are treated as ordinary values - * that can match; otherwise nil values never match. - * - * If nil_on_miss is set, a nil value is returned in r2 if there is no - * match in r for a particular value in l (left outer join). - * - * If semi is set, only a single set of values in r1/r2 is returned if - * there is a match of l in r, no matter how many matches there are in - * r; otherwise all matches are returned. - */ static gdk_return mergejoin_int(BAT *r1, BAT *r2, BAT *l, BAT *r, int nil_matches, BUN maxsize, lng t0, int swapped) @@ -1371,22 +1356,46 @@ mergejoin_lng(BAT *r1, BAT *r2, BAT *l, return GDK_FAIL; } +/* Perform a "merge" join on l and r (if both are sorted) with + * optional candidate lists, or join using binary search on r if l is + * not sorted. The return BATs have already been created by the + * caller. + * + * If nil_matches is set, nil values are treated as ordinary values + * that can match; otherwise nil values never match. + * + * If nil_on_miss is set, a nil value is returned in r2 if there is no + * match in r for a particular value in l (left outer join). + * + * If semi is set, only a single set of values in r1/r2 is returned if + * there is a match of l in r, no matter how many matches there are in + * r; otherwise all matches are returned. + * + * maxsize is the absolute maximum size the output BATs can become (if + * they were to become larger, we have a bug). + * + * t0 and swapped are only for debugging (ALGOMASK set in GDKdebug). + */ static gdk_return mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, int nil_on_miss, int semi, int only_misses, BUN maxsize, lng t0, int swapped) { - BUN lstart, lend, lcnt; + BUN lstart, lend; const oid *lcand, *lcandend; - BUN rstart, rend, rcnt, rstartorig; + BUN rstart, rend, rstartorig; + BUN lcnt, rcnt; /* not actively used, but needed for CANDINIT */ const oid *rcand, *rcandend, *rcandorig; + /* [lr]scan determine how far we look ahead in l/r in order to + * decide whether we want to do a binary search or a scan */ BUN lscan, rscan; - const char *lvals, *rvals; - const char *lvars, *rvars; - int lwidth, rwidth; + const void *lvals, *rvals; /* the values of l/r (NULL if dense) */ + const char *lvars, *rvars; /* the indirect values (NULL if fixed size) */ + int lwidth, rwidth; /* width of the values */ const void *nil = ATOMnilptr(l->ttype); int (*cmp)(const void *, const void *) = ATOMcompare(l->ttype); - const char *v, *prev = NULL; + const void *v; /* points to value under consideration */ + const void *prev = NULL; BUN nl, nr; BUN total; /* number of rows in l we scan */ int insert_nil; @@ -1400,10 +1409,10 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT * l/r: it determines the comparison function used */ int lordering, rordering; oid lv; - BUN i; + BUN i, j; /* counters */ int lskipped = 0; /* whether we skipped values in l */ - lng loff = 0, roff = 0; - oid lval = oid_nil, rval = oid_nil; + lng loff = 0, roff = 0; /* set if l/r is dense */ + oid lval = oid_nil, rval = oid_nil; /* temporary space to point v to */ if (sl == NULL && sr == NULL && !nil_on_miss && !semi && !only_misses && l->tsorted && r->tsorted && r2 != NULL) { @@ -1449,8 +1458,8 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend); CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend); total = lcand ? (BUN) (lcandend - lcand) : lend - lstart; - lvals = l->ttype == TYPE_void ? NULL : (const char *) Tloc(l, 0); - rvals = r->ttype == TYPE_void ? NULL : (const char *) Tloc(r, 0); + lvals = BATtvoid(l) ? NULL : Tloc(l, 0); + rvals = BATtvoid(r) ? NULL : Tloc(r, 0); if (l->tvarsized && l->ttype) { assert(r->tvarsized && r->ttype); lvars = l->tvheap->base; @@ -1468,14 +1477,14 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT if (lstart == lend || rstart == rend || (!nil_matches && - ((l->ttype == TYPE_void && l->tseqbase == oid_nil) || - (r->ttype == TYPE_void && r->tseqbase == oid_nil))) || - (l->ttype == TYPE_void && l->tseqbase == oid_nil && + ((BATtvoid(l) && l->tseqbase == oid_nil) || + (BATtvoid(r) && r->tseqbase == oid_nil))) || + (BATtvoid(l) && l->tseqbase == oid_nil && (r->tnonil || - (r->ttype == TYPE_void && r->tseqbase != oid_nil))) || - (r->ttype == TYPE_void && r->tseqbase == oid_nil && + (BATtvoid(r) && r->tseqbase != oid_nil))) || + (BATtvoid(r) && r->tseqbase == oid_nil && (l->tnonil || - (l->ttype == TYPE_void && l->tseqbase != oid_nil)))) { + (BATtvoid(l) && l->tseqbase != oid_nil)))) { /* there are no matches */ return nomatch(r1, r2, l, r, lstart, lend, lcand, lcandend, nil_on_miss, only_misses, "mergejoin", t0); @@ -1490,13 +1499,13 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT nl >>= 1; equal_order = (l->tsorted && r->tsorted) || (l->trevsorted && r->trevsorted && - l->ttype != TYPE_void && r->ttype != TYPE_void); + !BATtvoid(l) && !BATtvoid(r)); lordering = l->tsorted && (r->tsorted || !equal_order) ? 1 : -1; rordering = equal_order ? lordering : -lordering; } else { /* if l not sorted, we will always use binary search * on r */ - assert(l->ttype != TYPE_void); /* void is always sorted */ + assert(!BATtvoid(l)); /* void is always sorted */ lscan = 0; equal_order = 1; lordering = 1; @@ -1516,34 +1525,49 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT rscan++) nl >>= 1; - if (l->ttype == TYPE_void) { + if (BATtvoid(l)) { + assert(lvals == NULL); if (lcand) { lstart = 0; lend = (BUN) (lcandend - lcand); - lvals = (const char *) lcand; - lcand = NULL; - lwidth = SIZEOF_OID; } if (l->tseqbase == oid_nil) loff = lng_nil; else loff = (lng) l->tseqbase - (lng) l->hseqbase; } - if (r->ttype == TYPE_void) { + if (BATtvoid(r)) { + assert(rvals == NULL); if (rcand) { rstart = 0; rend = (BUN) (rcandend - rcand); - rvals = (const char *) rcand; - rcand = NULL; - rwidth = SIZEOF_OID; } if (r->tseqbase == oid_nil) roff = lng_nil; else roff = (lng) r->tseqbase - (lng) r->hseqbase; } - assert(lvals != NULL || lcand == NULL); - assert(rvals != NULL || rcand == NULL); + assert(loff == 0 || lvals == NULL); + assert(roff == 0 || rvals == NULL); + /* At this point the various variables that help us through + * the algorithm have been set. The table explains them. The + * first two columns are the inputs, the next three columns + * are the variables, the final two columns indicate how the + * variables can be used. + * + * l/r sl/sr | vals cand off | result value being matched + * -------------+-----------------+---------------------------------- + * dense NULL | NULL NULL set | i off==nil?nil:i+off + * dense dense | NULL NULL set | i off==nil?nil:i+off + * dense set | NULL set set | cand[i] off==nil?nil:cand[i]+off + * set NULL | set NULL 0 | i vals[i] + * set dense | set NULL 0 | i vals[i] + * set set | set set 0 | cand[i] vals[cand[i]] + * + * If {l,r}off is lng_nil, all values in the corresponding bat + * are oid_nil because the bat has type VOID and the tseqbase + * is nil. + */ rcandorig = rcand; rstartorig = rstart; @@ -1571,34 +1595,36 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT * value in r. * The next value to match in r is the first * if equal_order is set, the last - * otherwise. */ + * otherwise. + * When skipping over values in l, we count + * how many we skip in nlx. We need this in + * case only_misses or nil_on_miss is set, and + * to properly set the tdense property in the + * first output BAT. */ BUN nlx = 0; /* number of non-matching values in l */ if (rcand) { - if (rcand == rcandend) - v = NULL; - else + if (rcand == rcandend) { + v = NULL; /* no more values */ + } else if (roff != 0) { + rval = roff == lng_nil ? oid_nil : (oid) ((lng) (equal_order ? rcand[0] : rcandend[-1]) + roff); + v = &rval; + } else { v = VALUE(r, (equal_order ? rcand[0] : rcandend[-1]) - r->hseqbase); + } } else { if (rstart == rend) { v = NULL; } else if (rvals) { v = VALUE(r, equal_order ? rstart : rend - 1); - if (roff == lng_nil) { - rval = oid_nil; - v = (const char *) &rval; - } else if (roff != 0) { - rval = (oid) (*(const oid *)v + roff); - v = (const char *) &rval; - } } else { if (roff == lng_nil) rval = oid_nil; else if (equal_order) - rval = rstart + r->tseqbase; + rval = (oid) ((lng) rstart + r->tseqbase); else - rval = rend - 1 + r->tseqbase; - v = (const char *) &rval; + rval = (oid) ((lng) rend - 1 + r->tseqbase); + v = &rval; } } /* here, v points to next value in r, or if @@ -1611,46 +1637,50 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT nlx = lend - lstart; lstart = lend; } + } else if (loff == lng_nil) { + /* all l values are NIL, and the type is OID */ + if (* (oid *) v != oid_nil) { + /* value we're looking at in r + * is not NIL, so we match + * nothing */ + if (lcand) { + nlx = (BUN) (lcandend - lcand); + lcand = lcandend; + } else { + nlx = lend - lstart; + lstart = lend; + } + v = NULL; + } } else if (lcand) { - if (lscan < (BUN) (lcandend - lcand) && - lordering * cmp(VALUE(l, lcand[lscan] - l->hseqbase), - v) < 0) { - nlx = binsearch(lcand, l->hseqbase, - l->ttype, lvals, lvars, - lwidth, lscan, - (BUN) (lcandend - lcand), v, - lordering, 0); + if (lscan < (BUN) (lcandend - lcand)) { + if (lvals) { + if (lordering * cmp(VALUE(l, lcand[lscan] - l->hseqbase), v) < 0) { + nlx = binsearch(lcand, l->hseqbase, l->ttype, lvals, lvars, lwidth, lscan, (BUN) (lcandend - lcand), v, lordering, 0); + } + } else { + lval = (oid) ((lng) *(const oid*)v - loff); + if (lordering > 0 ? lcand[lscan] < lval : lcand[lscan] > lval) { + nlx = binsearch(NULL, 0, TYPE_oid, (const char *) lcand, NULL, SIZEOF_OID, 0, lcandend - lcand, &lval, 1, 0); + } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list