Changeset: 5be0f60d8000 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=5be0f60d8000 Modified Files: gdk/gdk_group.c Branch: Feb2013 Log Message:
More efficient algorithm to subgroup sorted values. See the comments for an explanation of the algorithm. diffs (108 lines): diff --git a/gdk/gdk_group.c b/gdk/gdk_group.c --- a/gdk/gdk_group.c +++ b/gdk/gdk_group.c @@ -392,10 +392,16 @@ BATgroup_internal(BAT **groups, BAT **ex *groups = gn; } else if (b->tsorted || b->trevsorted) { BUN i, j; + BUN *pgrp; /* for each value, we need to scan all previous equal * values (a consecutive, possibly empty, range) to - * see if we can find one in the same old group */ + * see if we can find one in the same old group + * + * we do this by maintaining for each old group the + * last time we saw that group, so if the last time we + * saw the old group of the current value is within + * this range, we can reuse the new group */ ALGODEBUG fprintf(stderr, "#BATgroup(b=%s#" BUNFMT "," "g=%s#" BUNFMT "," "e=%s#" BUNFMT "," @@ -406,12 +412,32 @@ BATgroup_internal(BAT **groups, BAT **ex e ? BATgetId(e) : "NULL", e ? BATcount(e) : 0, h ? BATgetId(h) : "NULL", h ? BATcount(h) : 0, subsorted); + /* determine how many old groups there are */ + if (e) { + j = BATcount(e) + (BUN) e->hseqbase; + } else if (h) { + j = BATcount(h) + (BUN) h->hseqbase; + } else { + oid m = 0; + for (i = 0, j = BATcount(g); i < j; i++) + if (grps[i] > m) + m = grps[i]; + j = (BUN) m + 1; + } + /* array to maintain last time we saw each old group */ + pgrp = GDKmalloc(sizeof(BUN) * j); + if (pgrp == NULL) + goto error; + /* initialize to impossible position */ + memset(pgrp, ~0, sizeof(BUN) * j); + pv = BUNtail(bi, BUNfirst(b)); ngrps[0] = ngrp; if (extents) exts[0] = b->hseqbase; if (histo) cnts[0] = 1; + pgrp[grps[0]] = BUNfirst(b); ngrp++; /* the next group to be assigned */ gn->tsorted = 1; /* be optimistic */ for (j = r = BUNfirst(b), p = r + 1, q = r + BATcount(b); @@ -419,21 +445,25 @@ BATgroup_internal(BAT **groups, BAT **ex p++) { v = BUNtail(bi, p); if (cmp(v, pv) == 0) { - /* range [j, p) is all same value, - * find one in the same group */ - for (i = j; i < p; i++) { - if (grps[i - r] == grps[p - r]) { - oid grp = ngrps[i - r]; - ngrps[p - r] = grp; - if (histo) - cnts[grp]++; - if (gn->tsorted && - grp != ngrp - 1) - gn->tsorted = 0; - break; - } - } - if (i < p) { + /* range [j, p) is all same value */ + /* i is position where we saw p's old + * group last */ + i = pgrp[grps[p - r]]; + /* p is new position where we saw this + * group */ + pgrp[grps[p - r]] = p; + if (j <= i && i < p) { + /* i is position of equal + * value in same old group as + * p, so p gets same new group + * as i */ + oid grp = ngrps[i - r]; + ngrps[p - r] = grp; + if (histo) + cnts[grp]++; + if (gn->tsorted && + grp != ngrp - 1) + gn->tsorted = 0; /* we found the value/group * combination, go to next * value */ @@ -443,10 +473,12 @@ BATgroup_internal(BAT **groups, BAT **ex /* value differs from previous value */ j = p; pv = v; + pgrp[grps[p - r]] = p; } /* start a new group */ GRPnotfound(); } + GDKfree(pgrp); } else if (b->T->hash) { /* we already have a hash table on b */ ALGODEBUG fprintf(stderr, "#BATgroup(b=%s#" BUNFMT "," _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list