Changeset: 48cf0f430685 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=48cf0f430685 Added Files: gdk/gdk_aggr.c gdk/gdk_calc_private.h Modified Files: gdk/Makefile.ag gdk/gdk_calc.c Branch: default Log Message:
Split off aggregate functions from gdk_calc.c into separate source file. diffs (truncated from 3303 to 300 lines): diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag --- a/gdk/Makefile.ag +++ b/gdk/Makefile.ag @@ -35,7 +35,8 @@ lib_gdk = { gdk_posix.c gdk_logger.c gdk_sample.c \ gdk_private.h gdk_delta.h gdk_logger.h gdk_posix.h \ gdk_system.h gdk_tm.h gdk_storage.h \ - gdk_calc.c gdk_calc.h gdk_calc_compare.h gdk_group.c \ + gdk_calc.c gdk_calc.h gdk_calc_compare.h gdk_calc_private.h \ + gdk_aggr.c gdk_group.c \ bat.feps bat1.feps bat2.feps \ libbat.rc LIBS = ../common/options/libmoptions \ diff --git a/gdk/gdk_aggr.c b/gdk/gdk_aggr.c new file mode 100644 --- /dev/null +++ b/gdk/gdk_aggr.c @@ -0,0 +1,1394 @@ +/* + * The contents of this file are subject to the MonetDB Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.monetdb.org/Legal/MonetDBLicense + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the + * License for the specific language governing rights and limitations + * under the License. + * + * The Original Code is the MonetDB Database System. + * + * The Initial Developer of the Original Code is CWI. + * Portions created by CWI are Copyright (C) 1997-July 2008 CWI. + * Copyright August 2008-2012 MonetDB B.V. + * All Rights Reserved. + */ + +#include "monetdb_config.h" +#include "gdk.h" +#include "gdk_private.h" +#include "gdk_calc_private.h" +#include <math.h> + +/* grouped aggregates + * + * The following functions take two to four input BATs and produce a + * single output BAT. + * + * The input BATs are + * - b, a dense-headed BAT with the values to work on in the tail; + * - g, a dense-headed BAT, aligned with b, with group ids (OID) in + * the tail; + * - e, optional but recommended, a dense-headed BAT with the list of + * group ids in the head(!) (the tail is completely ignored); + * - s, optional, a dense-headed bat with a list of candidate ids in + * the tail. + * + * The tail values of s refer to the head of b and g. Only entries at + * the specified ids are taken into account for the grouped + * aggregates. All other values are ignored. s is compatible with + * the result of BATsubselect(). + * + * If e is not specified, we need to do an extra scan over g to find + * out the range of the group ids that are used. e is defined in such + * a way that it can be either the extents or the histo result from + * BATgroups(). + * + * All functions calculate grouped aggregates. There are as many + * groups as there are entries in e. If e is not specified, the + * number of groups is equal to the difference between the maximum and + * minimum values in g. + * + * If a group is empty, the result for that group is nil. + * + * If there is overflow during the calculation of an aggregate, the + * whole operation fails if abort_on_error is set to non-zero, + * otherwise the result of the group in which the overflow occurred is + * nil. + * + * If skip_nils is non-zero, a nil value in b is ignored, otherwise a + * nil in b results in a nil result for the group. + */ + +/* helper function + * + * This function finds the minimum and maximum group id (and the + * number of groups) and initializes the variables for candidates + * selection. + */ +static gdk_return +initgroupaggr(const BAT *b, const BAT *g, const BAT *e, const BAT *s, + /* outputs: */ + oid *minp, oid *maxp, BUN *np, BUN *startp, BUN *endp, BUN *cntp, + const oid **candp, const oid **candendp) +{ + oid min, max; + BUN i, n; + const oid *gids; + BUN start, end, cnt; + const oid *cand = NULL, *candend = NULL; + + if (b == NULL || !BAThdense(b)) { + GDKerror("BATgroupsum: b must be dense-headed\n"); + return GDK_FAIL; + } + if (g == NULL || !BAThdense(g) || BATcount(b) != BATcount(g) || + (BATcount(b) != 0 && b->hseqbase != g->hseqbase)) { + GDKerror("BATgroupsum: b and g must be aligned\n"); + return GDK_FAIL; + } + assert(BATttype(g) == TYPE_oid); + if (e != NULL && !BAThdense(e)) { + GDKerror("BATgroupsum: e must be dense-headed\n"); + return GDK_FAIL; + } + if (e == NULL) { + /* we need to find out the min and max of g */ + min = oid_nil; /* note that oid_nil > 0! (unsigned) */ + max = 0; + if (BATtdense(g)) { + min = g->tseqbase; + max = g->tseqbase + BATcount(g) - 1; + } else if (g->tsorted) { + gids = (const oid *) Tloc(g, BUNfirst(g)); + /* find first non-nil */ + for (i = 0, n = BATcount(g); i < n; i++, gids++) { + if (*gids != oid_nil) { + min = *gids; + break; + } + } + if (min != oid_nil) { + /* found a non-nil, max must be last + * value (and there is one!) */ + max = * (const oid *) Tloc(g, BUNlast(g) - 1); + } + } else { + /* we'll do a complete scan */ + gids = (const oid *) Tloc(g, BUNfirst(g)); + for (i = 0, n = BATcount(g); i < n; i++) { + if (*gids != oid_nil) { + if (*gids < min) + min = *gids; + if (*gids > max) + max = *gids; + } + gids++; + } + /* note: max < min is possible if all groups + * are nil (or BATcount(g)==0) */ + } + n = max < min ? 0 : max - min + 1; + } else { + n = BATcount(e); + min = e->hseqbase; + max = e->hseqbase + n - 1; + } + *minp = min; + *maxp = max; + *np = n; + + CANDINIT(b, s); + *startp = start; + *endp = end; + *cntp = cnt; + *candp = cand; + *candendp = candend; + + return GDK_SUCCEED; +} + +#define AGGR_SUM(TYPE1, TYPE2) \ + do { \ + const TYPE1 *vals = (const TYPE1 *) Tloc(b, BUNfirst(b)); \ + for (i = start; i < end; i++, vals++) { \ + if (cand) { \ + if (i < *cand - b->hseqbase) { \ + if (gids) \ + gids++; \ + continue; \ + } \ + assert(i == *cand - b->hseqbase); \ + if (++cand == candend) \ + end = i + 1; \ + } \ + if (gids == NULL || \ + (*gids >= min && *gids <= max)) { \ + gid = gids ? *gids - min : (oid) i; \ + if (!(seen[gid >> 5] & (1 << (gid & 0x1F)))) { \ + seen[gid >> 5] |= 1 << (gid & 0x1F); \ + sums[gid] = 0; \ + } \ + if (*vals == TYPE1##_nil) { \ + if (!skip_nils) { \ + sums[gid] = TYPE2##_nil; \ + nils++; \ + } \ + } else if (sums[gid] != TYPE2##_nil) { \ + ADD_WITH_CHECK(TYPE1, *vals, \ + TYPE2, sums[gid], \ + TYPE2, sums[gid], \ + goto overflow); \ + } \ + } \ + if (gids) \ + gids++; \ + } \ + } while (0) + +/* calculate group sums with optional candidates list */ +BAT * +BATgroupsum(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int skip_nils, int abort_on_error) +{ + const oid *gids; + oid gid; + oid min, max; + BUN i, ngrp; + BUN nils = 0; + BAT *bn; + unsigned int *seen; /* bitmask for groups that we've seen */ + BUN start, end, cnt; + const oid *cand = NULL, *candend = NULL; + + if (initgroupaggr(b, g, e, s, &min, &max, &ngrp, + &start, &end, &cnt, &cand, &candend) == GDK_FAIL) + return NULL; + + if (BATcount(b) == 0 || ngrp == 0) { + /* trivial: no sums, so return bat aligned with g with + * nil in the tail */ + bn = BATconstant(tp, ATOMnilptr(tp), ngrp); + BATseqbase(bn, ngrp == 0 ? 0 : min); + return bn; + } + + if ((e == NULL || + (BATcount(e) == BATcount(b) && e->hseqbase == b->hseqbase)) && + (BATtdense(g) || (g->tkey && g->T->nonil))) { + /* trivial: singleton groups, so all results are equal + * to the inputs (but possibly a different type) */ + return BATconvert(b, s, tp, abort_on_error); + } + + /* allocate bitmap for seen group ids */ + seen = GDKzalloc(((ngrp + 31) / 32) * sizeof(int)); + if (seen == NULL) { + GDKerror("BATgroupsum: cannot allocate enough memory\n"); + return NULL; + } + + bn = BATnew(TYPE_void, tp, ngrp); + if (bn == NULL) { + GDKfree(seen); + return NULL; + } + + if (BATtdense(g)) + gids = NULL; + else + gids = (const oid *) Tloc(g, BUNfirst(g) + start); + + switch (ATOMstorage(tp)) { + case TYPE_bte: { + bte *sums = (bte *) Tloc(bn, BUNfirst(bn)); + for (i = 0; i < ngrp; i++) + sums[i] = bte_nil; + switch (ATOMstorage(b->ttype)) { + case TYPE_bte: + AGGR_SUM(bte, bte); + break; + default: + goto unsupported; + } + break; + } + case TYPE_sht: { + sht *sums = (sht *) Tloc(bn, BUNfirst(bn)); + for (i = 0; i < ngrp; i++) + sums[i] = sht_nil; + switch (ATOMstorage(b->ttype)) { + case TYPE_bte: + AGGR_SUM(bte, sht); + break; + case TYPE_sht: + AGGR_SUM(sht, sht); + break; + default: + goto unsupported; + } + break; + } + case TYPE_int: { + int *sums = (int *) Tloc(bn, BUNfirst(bn)); + for (i = 0; i < ngrp; i++) + sums[i] = int_nil; + switch (ATOMstorage(b->ttype)) { + case TYPE_bte: + AGGR_SUM(bte, int); + break; + case TYPE_sht: _______________________________________________ Checkin-list mailing list Checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list