Changeset: a3b13a3f92d4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a3b13a3f92d4
Modified Files:
        gdk/gdk_join.c
        gdk/gdk_private.h
        gdk/gdk_search.c
Branch: Jul2017
Log Message:

Merge with Dec2016 branch.


diffs (truncated from 1081 to 300 lines):

diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -205,8 +205,8 @@ joininitresults(BAT **r1p, BAT **r2p, BU
 
 #define VALUE(s, x)    (s##vars ? \
                         s##vars + VarHeapVal(s##vals, (x), s##width) : \
-                        s##vals + ((x) * s##width))
-#define FVALUE(s, x)   (s##vals + ((x) * s##width))
+                        (const char *) s##vals + ((x) * s##width))
+#define FVALUE(s, x)   ((const char *) s##vals + ((x) * s##width))
 
 #define APPEND(b, o)           (((oid *) b->theap.base)[b->batCount++] = (o))
 
@@ -762,21 +762,6 @@ mergejoin_void(BAT *r1, BAT *r2, BAT *l,
        return GDK_FAIL;
 }
 
-/* Perform a "merge" join on l and r (if both are sorted) with
- * optional candidate lists, or join using binary search on r if l is
- * not sorted.  The return BATs have already been created by the
- * caller.
- *
- * If nil_matches is set, nil values are treated as ordinary values
- * that can match; otherwise nil values never match.
- *
- * If nil_on_miss is set, a nil value is returned in r2 if there is no
- * match in r for a particular value in l (left outer join).
- *
- * If semi is set, only a single set of values in r1/r2 is returned if
- * there is a match of l in r, no matter how many matches there are in
- * r; otherwise all matches are returned.
- */
 static gdk_return
 mergejoin_int(BAT *r1, BAT *r2, BAT *l, BAT *r,
              int nil_matches, BUN maxsize, lng t0, int swapped)
@@ -1371,22 +1356,46 @@ mergejoin_lng(BAT *r1, BAT *r2, BAT *l, 
        return GDK_FAIL;
 }
 
+/* Perform a "merge" join on l and r (if both are sorted) with
+ * optional candidate lists, or join using binary search on r if l is
+ * not sorted.  The return BATs have already been created by the
+ * caller.
+ *
+ * If nil_matches is set, nil values are treated as ordinary values
+ * that can match; otherwise nil values never match.
+ *
+ * If nil_on_miss is set, a nil value is returned in r2 if there is no
+ * match in r for a particular value in l (left outer join).
+ *
+ * If semi is set, only a single set of values in r1/r2 is returned if
+ * there is a match of l in r, no matter how many matches there are in
+ * r; otherwise all matches are returned.
+ *
+ * maxsize is the absolute maximum size the output BATs can become (if
+ * they were to become larger, we have a bug).
+ *
+ * t0 and swapped are only for debugging (ALGOMASK set in GDKdebug).
+ */
 static gdk_return
 mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
          int nil_matches, int nil_on_miss, int semi, int only_misses,
          BUN maxsize, lng t0, int swapped)
 {
-       BUN lstart, lend, lcnt;
+       BUN lstart, lend;
        const oid *lcand, *lcandend;
-       BUN rstart, rend, rcnt, rstartorig;
+       BUN rstart, rend, rstartorig;
+       BUN lcnt, rcnt;         /* not actively used, but needed for CANDINIT */
        const oid *rcand, *rcandend, *rcandorig;
+       /* [lr]scan determine how far we look ahead in l/r in order to
+        * decide whether we want to do a binary search or a scan */
        BUN lscan, rscan;
-       const char *lvals, *rvals;
-       const char *lvars, *rvars;
-       int lwidth, rwidth;
+       const void *lvals, *rvals; /* the values of l/r (NULL if dense) */
+       const char *lvars, *rvars; /* the indirect values (NULL if fixed size) 
*/
+       int lwidth, rwidth;        /* width of the values */
        const void *nil = ATOMnilptr(l->ttype);
        int (*cmp)(const void *, const void *) = ATOMcompare(l->ttype);
-       const char *v, *prev = NULL;
+       const void *v;          /* points to value under consideration */
+       const void *prev = NULL;
        BUN nl, nr;
        BUN total;              /* number of rows in l we scan */
        int insert_nil;
@@ -1400,10 +1409,10 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
         * l/r: it determines the comparison function used */
        int lordering, rordering;
        oid lv;
-       BUN i;
+       BUN i, j;               /* counters */
        int lskipped = 0;       /* whether we skipped values in l */
-       lng loff = 0, roff = 0;
-       oid lval = oid_nil, rval = oid_nil;
+       lng loff = 0, roff = 0; /* set if l/r is dense */
+       oid lval = oid_nil, rval = oid_nil; /* temporary space to point v to */
 
        if (sl == NULL && sr == NULL && !nil_on_miss &&
            !semi && !only_misses && l->tsorted && r->tsorted && r2 != NULL) {
@@ -1449,8 +1458,8 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
        CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
        CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
        total = lcand ? (BUN) (lcandend - lcand) : lend - lstart;
-       lvals = l->ttype == TYPE_void ? NULL : (const char *) Tloc(l, 0);
-       rvals = r->ttype == TYPE_void ? NULL : (const char *) Tloc(r, 0);
+       lvals = BATtvoid(l) ? NULL : Tloc(l, 0);
+       rvals = BATtvoid(r) ? NULL : Tloc(r, 0);
        if (l->tvarsized && l->ttype) {
                assert(r->tvarsized && r->ttype);
                lvars = l->tvheap->base;
@@ -1468,14 +1477,14 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
        if (lstart == lend ||
            rstart == rend ||
            (!nil_matches &&
-            ((l->ttype == TYPE_void && l->tseqbase == oid_nil) ||
-             (r->ttype == TYPE_void && r->tseqbase == oid_nil))) ||
-           (l->ttype == TYPE_void && l->tseqbase == oid_nil &&
+            ((BATtvoid(l) && l->tseqbase == oid_nil) ||
+             (BATtvoid(r) && r->tseqbase == oid_nil))) ||
+           (BATtvoid(l) && l->tseqbase == oid_nil &&
             (r->tnonil ||
-             (r->ttype == TYPE_void && r->tseqbase != oid_nil))) ||
-           (r->ttype == TYPE_void && r->tseqbase == oid_nil &&
+             (BATtvoid(r) && r->tseqbase != oid_nil))) ||
+           (BATtvoid(r) && r->tseqbase == oid_nil &&
             (l->tnonil ||
-             (l->ttype == TYPE_void && l->tseqbase != oid_nil)))) {
+             (BATtvoid(l) && l->tseqbase != oid_nil)))) {
                /* there are no matches */
                return nomatch(r1, r2, l, r, lstart, lend, lcand, lcandend,
                               nil_on_miss, only_misses, "mergejoin", t0);
@@ -1490,13 +1499,13 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                        nl >>= 1;
                equal_order = (l->tsorted && r->tsorted) ||
                        (l->trevsorted && r->trevsorted &&
-                        l->ttype != TYPE_void && r->ttype != TYPE_void);
+                        !BATtvoid(l) && !BATtvoid(r));
                lordering = l->tsorted && (r->tsorted || !equal_order) ? 1 : -1;
                rordering = equal_order ? lordering : -lordering;
        } else {
                /* if l not sorted, we will always use binary search
                 * on r */
-               assert(l->ttype != TYPE_void); /* void is always sorted */
+               assert(!BATtvoid(l)); /* void is always sorted */
                lscan = 0;
                equal_order = 1;
                lordering = 1;
@@ -1516,34 +1525,49 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
             rscan++)
                nl >>= 1;
 
-       if (l->ttype == TYPE_void) {
+       if (BATtvoid(l)) {
+               assert(lvals == NULL);
                if (lcand) {
                        lstart = 0;
                        lend = (BUN) (lcandend - lcand);
-                       lvals = (const char *) lcand;
-                       lcand = NULL;
-                       lwidth = SIZEOF_OID;
                }
                if (l->tseqbase == oid_nil)
                        loff = lng_nil;
                else
                        loff = (lng) l->tseqbase - (lng) l->hseqbase;
        }
-       if (r->ttype == TYPE_void) {
+       if (BATtvoid(r)) {
+               assert(rvals == NULL);
                if (rcand) {
                        rstart = 0;
                        rend = (BUN) (rcandend - rcand);
-                       rvals = (const char *) rcand;
-                       rcand = NULL;
-                       rwidth = SIZEOF_OID;
                }
                if (r->tseqbase == oid_nil)
                        roff = lng_nil;
                else
                        roff = (lng) r->tseqbase - (lng) r->hseqbase;
        }
-       assert(lvals != NULL || lcand == NULL);
-       assert(rvals != NULL || rcand == NULL);
+       assert(loff == 0 || lvals == NULL);
+       assert(roff == 0 || rvals == NULL);
+       /* At this point the various variables that help us through
+        * the algorithm have been set.  The table explains them.  The
+        * first two columns are the inputs, the next three columns
+        * are the variables, the final two columns indicate how the
+        * variables can be used.
+        *
+        * l/r    sl/sr | vals  cand  off | result   value being matched
+        * -------------+-----------------+----------------------------------
+        * dense  NULL  | NULL  NULL  set | i        off==nil?nil:i+off
+        * dense  dense | NULL  NULL  set | i        off==nil?nil:i+off
+        * dense  set   | NULL  set   set | cand[i]  off==nil?nil:cand[i]+off
+        * set    NULL  | set   NULL  0   | i        vals[i]
+        * set    dense | set   NULL  0   | i        vals[i]
+        * set    set   | set   set   0   | cand[i]  vals[cand[i]]
+        *
+        * If {l,r}off is lng_nil, all values in the corresponding bat
+        * are oid_nil because the bat has type VOID and the tseqbase
+        * is nil.
+        */
 
        rcandorig = rcand;
        rstartorig = rstart;
@@ -1571,34 +1595,36 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                         * value in r.
                         * The next value to match in r is the first
                         * if equal_order is set, the last
-                        * otherwise. */
+                        * otherwise.
+                        * When skipping over values in l, we count
+                        * how many we skip in nlx.  We need this in
+                        * case only_misses or nil_on_miss is set, and
+                        * to properly set the tdense property in the
+                        * first output BAT. */
                        BUN nlx = 0; /* number of non-matching values in l */
 
                        if (rcand) {
-                               if (rcand == rcandend)
-                                       v = NULL;
-                               else
+                               if (rcand == rcandend) {
+                                       v = NULL; /* no more values */
+                               } else if (roff != 0) {
+                                       rval = roff == lng_nil ? oid_nil : 
(oid) ((lng) (equal_order ? rcand[0] : rcandend[-1]) + roff);
+                                       v = &rval;
+                               } else {
                                        v = VALUE(r, (equal_order ? rcand[0] : 
rcandend[-1]) - r->hseqbase);
+                               }
                        } else {
                                if (rstart == rend) {
                                        v = NULL;
                                } else if (rvals) {
                                        v = VALUE(r, equal_order ? rstart : 
rend - 1);
-                                       if (roff == lng_nil) {
-                                               rval = oid_nil;
-                                               v = (const char *) &rval;
-                                       } else if (roff != 0) {
-                                               rval = (oid) (*(const oid *)v + 
roff);
-                                               v = (const char *) &rval;
-                                       }
                                } else {
                                        if (roff == lng_nil)
                                                rval = oid_nil;
                                        else if (equal_order)
-                                               rval = rstart + r->tseqbase;
+                                               rval = (oid) ((lng) rstart + 
r->tseqbase);
                                        else
-                                               rval = rend - 1 + r->tseqbase;
-                                       v = (const char *) &rval;
+                                               rval = (oid) ((lng) rend - 1 + 
r->tseqbase);
+                                       v = &rval;
                                }
                        }
                        /* here, v points to next value in r, or if
@@ -1611,46 +1637,50 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                        nlx = lend - lstart;
                                        lstart = lend;
                                }
+                       } else if (loff == lng_nil) {
+                               /* all l values are NIL, and the type is OID */
+                               if (* (oid *) v != oid_nil) {
+                                       /* value we're looking at in r
+                                        * is not NIL, so we match
+                                        * nothing */
+                                       if (lcand) {
+                                               nlx = (BUN) (lcandend - lcand);
+                                               lcand = lcandend;
+                                       } else {
+                                               nlx = lend - lstart;
+                                               lstart = lend;
+                                       }
+                                       v = NULL;
+                               }
                        } else if (lcand) {
-                               if (lscan < (BUN) (lcandend - lcand) &&
-                                   lordering * cmp(VALUE(l, lcand[lscan] - 
l->hseqbase),
-                                                   v) < 0) {
-                                       nlx = binsearch(lcand, l->hseqbase,
-                                                       l->ttype, lvals, lvars,
-                                                       lwidth, lscan,
-                                                       (BUN) (lcandend - 
lcand), v,
-                                                       lordering, 0);
+                               if (lscan < (BUN) (lcandend - lcand)) {
+                                       if (lvals) {
+                                               if (lordering * cmp(VALUE(l, 
lcand[lscan] - l->hseqbase), v) < 0) {
+                                                       nlx = binsearch(lcand, 
l->hseqbase, l->ttype, lvals, lvars, lwidth, lscan, (BUN) (lcandend - lcand), 
v, lordering, 0);
+                                               }
+                                       } else {
+                                               lval = (oid) ((lng) *(const 
oid*)v - loff);
+                                               if (lordering > 0 ? 
lcand[lscan] < lval : lcand[lscan] > lval) {
+                                                       nlx = binsearch(NULL, 
0, TYPE_oid, (const char *) lcand, NULL, SIZEOF_OID, 0, lcandend - lcand, 
&lval, 1, 0);
+                                               }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to