Changeset: 57eb02a13b11 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=57eb02a13b11
Modified Files:
        clients/Tests/exports.stable.out
        gdk/gdk.h
        gdk/gdk_join.c
        monetdb5/modules/kernel/algebra.mx
Branch: default
Log Message:

Initial version of a BATsubhashjoin, the second of the new join algorithms.
Interfaces, especially at the MAL level are likely to change.  This
commit is just to store the work in the repository.


diffs (truncated from 373 to 300 lines):

diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -194,6 +194,7 @@ BAT *BATsort(BAT *b);
 BAT *BATsort_rev(BAT *b);
 BAT *BATssort(BAT *b);
 BAT *BATssort_rev(BAT *b);
+gdk_return BATsubhashjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT 
*sr);
 gdk_return BATsubmergejoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT *sl, BAT 
*sr);
 BAT *BATsubselect(BAT *b, BAT *s, const void *tl, const void *th, int li, int 
hi, int anti);
 gdk_return BATsubsort(BAT **sorted, BAT **order, BAT **groups, BAT *b, BAT *o, 
BAT *g, int reverse, int stable);
@@ -826,6 +827,8 @@ str ALGssort(int *result, int *bid);
 str ALGssort_rev(int *result, int *bid);
 str ALGstdev(dbl *res, int *bid);
 str ALGstdevp(dbl *res, int *bid);
+str ALGsubhashjoin(bat *r1, bat *r2, bat *l, bat *r);
+str ALGsubhashjoin4(bat *r1, bat *r2, bat *l, bat *r, bat *sl, bat *sr);
 str ALGsubmergejoin(bat *r1, bat *r2, bat *l, bat *r);
 str ALGsubmergejoin4(bat *r1, bat *r2, bat *l, bat *r, bat *sl, bat *sr);
 str ALGsubsample(int *result, int *bid, int *param);
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -3166,6 +3166,7 @@ gdk_export BAT *BATouterjoin(BAT *l, BAT
 gdk_export BAT *BATcross(BAT *l, BAT *r);
 
 gdk_export gdk_return BATsubmergejoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT 
*sl, BAT *sr);
+gdk_export gdk_return BATsubhashjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT 
*sl, BAT *sr);
 
 gdk_export BAT *BATslice(BAT *b, BUN low, BUN high);
 gdk_export BAT *BATfetch(BAT *b, BAT *s);
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -466,3 +466,258 @@ BATsubmergejoin(BAT **r1p, BAT **r2p, BA
        *r2p = r2;
        return mergejoin(r1, r2, l, r, sl, sr, 0, 0, 0);
 }
+
+/* binary search in a candidate list, return 1 if found, 0 if not */
+static int
+binsearchcand(const oid *cand, BUN lo, BUN hi, oid v)
+{
+       BUN mid;
+
+       --hi;                   /* now hi is inclusive */
+       if (v < cand[lo] || v > cand[hi])
+               return 0;
+       while (hi > lo) {
+               mid = (lo + hi) / 2;
+               if (cand[mid] == v)
+                       return 1;
+               if (cand[mid] < v)
+                       lo = mid + 1;
+               else
+                       hi = mid - 1;
+       }
+       return cand[lo] == v;
+}
+
+static gdk_return
+hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, 
int nil_on_miss, int semi)
+{
+       BUN lstart, lend, lcnt;
+       const oid *lcand = NULL, *lcandend = NULL;
+       BUN rstart, rend, rcnt;
+       const oid *rcand = NULL, *rcandend = NULL;
+       oid lo, ro;
+       BATiter ri;
+       BUN rb;
+       wrd off;
+       BUN nr, nrcand, newcap;
+       const char *lvals;
+       const char *lvars;
+       int lwidth;
+       const void *nil = ATOMnilptr(l->ttype);
+       int (*cmp)(const void *, const void *) = BATatoms[l->ttype].atomCmp;
+       const char *v;
+
+       assert(BAThdense(l));
+       assert(BAThdense(r));
+       assert(l->ttype == r->ttype);
+       assert(sl == NULL || sl->tsorted);
+       assert(sr == NULL || sr->tsorted);
+
+       CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
+       CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
+       lwidth = l->T->width;
+       lvals = (const char *) Tloc(l, BUNfirst(l));
+       if (l->tvarsized && l->ttype) {
+               assert(r->tvarsized && r->ttype);
+               lvars = l->T->vheap->base;
+       } else {
+               assert(!r->tvarsized || !r->ttype);
+               lvars = NULL;
+       }
+       off = r->hseqbase - BUNfirst(r);
+
+       /* set basic properties, they will be adjusted if necessary
+        * later on */
+       r1->tsorted = 1;
+       r1->trevsorted = 1;
+       r1->tkey = 1;
+       r1->T->nil = 0;
+       r1->T->nonil = 1;
+       /* r2 is not likely to be sorted (although it is certainly
+        * possible) */
+       r2->tsorted = 0;
+       r2->trevsorted = 0;
+       r2->T->nil = 0;
+       r2->T->nonil = 1;
+       /* we may miss "key" opportunities: duplicates in l and r
+        * might not match a value on the other side */
+       r2->tkey = l->tkey && r->tkey;
+
+       if (lstart == lend || (!nil_on_miss && rstart == rend)) {
+               /* nothing to do: there are no matches */
+               return GDK_SUCCEED;
+       }
+
+       /* hashes work on HEAD column */
+       r = BATmirror(r);
+       if (BATprepareHash(r))
+               goto bailout;
+       ri = bat_iterator(r);
+       nrcand = (BUN) (rcandend - rcand);
+
+       if (lcand) {
+               while (lcand < lcandend) {
+                       lo = *lcand++;
+                       v = VALUE(l, lo - l->hseqbase);
+                       if (!nil_matches && cmp(v, nil) == 0)
+                               continue;
+                       nr = 0;
+                       if (rcand) {
+                               HASHloop(ri, r->H->hash, rb, v) {
+                                       ro = (oid) rb + off;
+                                       if (!binsearchcand(rcand, 0, nrcand, 
ro))
+                                               continue;
+                                       if (BUNlast(r1) == BATcapacity(r1)) {
+                                               newcap = BATgrows(r1);
+                                               r1 = BATextend(r1, newcap);
+                                               r2 = BATextend(r2, newcap);
+                                               if (r1 == NULL || r2 == NULL)
+                                                       goto bailout;
+                                               assert(BATcapacity(r1) == 
BATcapacity(r2));
+                                       }
+                                       APPEND(r1, lo);
+                                       APPEND(r2, ro);
+                                       nr++;
+                                       if (semi)
+                                               break;
+                               }
+                       } else {
+                               HASHloop(ri, r->H->hash, rb, v) {
+                                       ro = (oid) rb + off;
+                                       if (ro < rstart || ro >= rend)
+                                               continue;
+                                       if (BUNlast(r1) == BATcapacity(r1)) {
+                                               newcap = BATgrows(r1);
+                                               r1 = BATextend(r1, newcap);
+                                               r2 = BATextend(r2, newcap);
+                                               if (r1 == NULL || r2 == NULL)
+                                                       goto bailout;
+                                               assert(BATcapacity(r1) == 
BATcapacity(r2));
+                                       }
+                                       APPEND(r1, lo);
+                                       APPEND(r2, ro);
+                                       nr++;
+                                       if (semi)
+                                               break;
+                               }
+                       }
+                       if (nr == 0 && nil_on_miss) {
+                               nr = 1;
+                               r2->T->nil = 1;
+                               r2->T->nonil = 0;
+                               r2->tkey = 0;
+                               if (BUNlast(r1) == BATcapacity(r1)) {
+                                       newcap = BATgrows(r1);
+                                       r1 = BATextend(r1, newcap);
+                                       r2 = BATextend(r2, newcap);
+                                       if (r1 == NULL || r2 == NULL)
+                                               goto bailout;
+                                       assert(BATcapacity(r1) == 
BATcapacity(r2));
+                               }
+                               APPEND(r1, lo);
+                               APPEND(r2, oid_nil);
+                       } else if (nr > 1)
+                               r1->tkey = 0;
+                       if (nr > 0 && BATcount(r1) > nr)
+                               r1->trevsorted = 0;
+               }
+       } else {
+               for (lo = lstart - BUNfirst(l) + l->hseqbase; lstart < lend; 
lo++) {
+                       v = VALUE(l, lstart);
+                       lstart++;
+                       if (!nil_matches && cmp(v, nil) == 0)
+                               continue;
+                       nr = 0;
+                       if (rcand) {
+                               HASHloop(ri, r->H->hash, rb, v) {
+                                       ro = (oid) rb + off;
+                                       if (!binsearchcand(rcand, 0, nrcand, 
ro))
+                                               continue;
+                                       if (BUNlast(r1) == BATcapacity(r1)) {
+                                               newcap = BATgrows(r1);
+                                               r1 = BATextend(r1, newcap);
+                                               r2 = BATextend(r2, newcap);
+                                               if (r1 == NULL || r2 == NULL)
+                                                       goto bailout;
+                                               assert(BATcapacity(r1) == 
BATcapacity(r2));
+                                       }
+                                       APPEND(r1, lo);
+                                       APPEND(r2, ro);
+                                       nr++;
+                                       if (semi)
+                                               break;
+                               }
+                       } else {
+                               HASHloop(ri, r->H->hash, rb, v) {
+                                       ro = (oid) rb + off;
+                                       if (ro < rstart || ro >= rend)
+                                               continue;
+                                       if (BUNlast(r1) == BATcapacity(r1)) {
+                                               newcap = BATgrows(r1);
+                                               r1 = BATextend(r1, newcap);
+                                               r2 = BATextend(r2, newcap);
+                                               if (r1 == NULL || r2 == NULL)
+                                                       goto bailout;
+                                               assert(BATcapacity(r1) == 
BATcapacity(r2));
+                                       }
+                                       APPEND(r1, lo);
+                                       APPEND(r2, ro);
+                                       nr++;
+                                       if (semi)
+                                               break;
+                               }
+                       }
+                       if (nr == 0 && nil_on_miss) {
+                               nr = 1;
+                               r2->T->nil = 1;
+                               r2->T->nonil = 0;
+                               if (BUNlast(r1) == BATcapacity(r1)) {
+                                       newcap = BATgrows(r1);
+                                       r1 = BATextend(r1, newcap);
+                                       r2 = BATextend(r2, newcap);
+                                       if (r1 == NULL || r2 == NULL)
+                                               goto bailout;
+                                       assert(BATcapacity(r1) == 
BATcapacity(r2));
+                               }
+                               APPEND(r1, lo);
+                               APPEND(r2, oid_nil);
+                       } else if (nr > 1)
+                               r1->tkey = 0;
+                       if (nr > 0 && BATcount(r1) > nr)
+                               r1->trevsorted = 0;
+               }
+       }
+       assert(BATcount(r1) == BATcount(r2));
+       if (BATcount(r1) <= 1) {
+               r1->tsorted = 1;
+               r1->trevsorted = 1;
+               r1->tkey = 1;
+               r2->tsorted = 1;
+               r2->trevsorted = 1;
+               r2->tkey = 1;
+       }
+       return GDK_SUCCEED;
+
+  bailout:
+       if (r1)
+               BBPreclaim(r1);
+       if (r2)
+               BBPreclaim(r2);
+       return GDK_FAIL;
+}
+
+gdk_return
+BATsubhashjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr)
+{
+       BAT *r1, *r2;
+
+       *r1p = NULL;
+       *r2p = NULL;
+       if (joinparamcheck(l, r, sl, sr, "BATsubleftjoin") == GDK_FAIL)
+               return GDK_FAIL;
+       if (joininitresults(&r1, &r2, sl ? BATcount(sl) : BATcount(l), 
"BATsubhashjoin") == GDK_FAIL)
+               return GDK_FAIL;
+       *r1p = r1;
+       *r2p = r2;
+       return hashjoin(r1, r2, l, r, sl, sr, 0, 0, 0);
+}
diff --git a/monetdb5/modules/kernel/algebra.mx 
b/monetdb5/modules/kernel/algebra.mx
--- a/monetdb5/modules/kernel/algebra.mx
+++ b/monetdb5/modules/kernel/algebra.mx
@@ -712,6 +712,13 @@ command submergejoin(l:bat[:oid,:any_1],
 address ALGsubmergejoin4
 comment "Merge join with candidate lists";
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to