Changeset: b7d407bb4c1b for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b7d407bb4c1b Modified Files: gdk/gdk_analytic_func.c gdk/gdk_calc.h gdk/gdk_string.c sql/backends/monet5/sql_rank.c sql/backends/monet5/sql_rank.h sql/backends/monet5/sql_rank.mal sql/backends/monet5/sql_rank.mal.sh sql/backends/monet5/sql_upgrades.c sql/common/sql_types.c sql/scripts/51_sys_schema_extension.sql sql/test/group-concat/Tests/groupconcat05.stable.out Branch: statistics-analytics Log Message:
Implemented group_concat as a window function. Also fixed null case handling in the aggregate version. Diffs (truncated from 538 to 300 lines): diff --git a/gdk/gdk_analytic_func.c b/gdk/gdk_analytic_func.c --- a/gdk/gdk_analytic_func.c +++ b/gdk/gdk_analytic_func.c @@ -1018,8 +1018,7 @@ GDKanalytical##OP(BAT *r, BAT *b, BAT *s } \ if (BUNappend(r, curval, false) != GDK_SUCCEED) \ goto allocation_error; \ - if (atomcmp(curval, nil) == 0) \ - has_nils = true; \ + has_nils |= atomcmp(curval, nil) == 0; \ } \ } \ } \ diff --git a/gdk/gdk_calc.h b/gdk/gdk_calc.h --- a/gdk/gdk_calc.h +++ b/gdk/gdk_calc.h @@ -161,5 +161,6 @@ gdk_export BAT *BATgroupcovariance_sampl gdk_export BAT *BATgroupcovariance_population(BAT *b1, BAT *b2, BAT *g, BAT *e, BAT *s, int tp, bool skip_nils, bool abort_on_error); gdk_export BAT *BATgroupcorrelation(BAT *b1, BAT *b2, BAT *g, BAT *e, BAT *s, int tp, bool skip_nils, bool abort_on_error); -gdk_export BAT *BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool skip_nils, bool abort_on_error, const char *separator); -gdk_export gdk_return BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils, bool abort_on_error, bool nil_if_empty, const char *separator); +gdk_export BAT *BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool skip_nils, bool abort_on_error, const char *restrict separator); +gdk_export gdk_return BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils, bool abort_on_error, bool nil_if_empty, const char *restrict separator); +gdk_export gdk_return GDKanalytical_str_group_concat(BAT *r, BAT *b, BAT *sep, BAT *s, BAT *e, const char *restrict separator); diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c --- a/gdk/gdk_string.c +++ b/gdk/gdk_string.c @@ -814,7 +814,7 @@ static gdk_return concat_strings(BAT **bnp, ValPtr pt, BAT *b, oid seqb, BUN ngrp, struct canditer *restrict ci, BUN ncand, const oid *restrict gids, oid min, oid max, bool skip_nils, - BAT *sep, 
const char *separator, BUN *has_nils) + BAT *sep, const char *restrict separator, BUN *has_nils) { oid gid; BUN i, p, nils = 0; @@ -866,15 +866,22 @@ concat_strings(BAT **bnp, ValPtr pt, BAT p = canditer_next(ci) - seqb; s = BUNtvar(bi, p); sl = BUNtvar(bis, p); - if (GDK_STRNIL(s) || GDK_STRNIL(sl)) { + if (GDK_STRNIL(s)) { if (!skip_nils) { nils = 1; break; } } else { single_length += strlen(s); - if (!empty) - single_length += strlen(sl); + if (!empty) { + if (GDK_STRNIL(sl)) { + if (!skip_nils) { + nils = 1; + break; + } + } else + single_length += strlen(sl); + } empty = false; } } @@ -907,9 +914,9 @@ concat_strings(BAT **bnp, ValPtr pt, BAT p = canditer_next(ci) - seqb; s = BUNtvar(bi, p); sl = BUNtvar(bis, p); - if (GDK_STRNIL(s) || GDK_STRNIL(sl)) + if (GDK_STRNIL(s)) continue; - if (!empty) { + if (!empty && !GDK_STRNIL(sl)) { next_sep_len = strlen(sl); memcpy(single_str + offset, sl, next_sep_len); offset += next_sep_len; @@ -921,11 +928,7 @@ concat_strings(BAT **bnp, ValPtr pt, BAT } } - if (empty) { - if (!(single_str = GDKstrdup(str_nil))) - return GDK_FAIL; - } else - single_str[offset] = '\0'; + single_str[offset] = '\0'; if (bn) { if (BUNappend(bn, single_str, false) != GDK_SUCCEED) { GDKfree(single_str); @@ -989,10 +992,14 @@ concat_strings(BAT **bnp, ValPtr pt, BAT continue; s = BUNtvar(bi, i); sl = BUNtvar(bis, i); - if (!GDK_STRNIL(s) && !GDK_STRNIL(sl)) { - next_sep_len = strlen(sl); - lengths[gid] += strlen(s) + next_sep_len; - lastseplength[gid] = next_sep_len; + if (!GDK_STRNIL(s)) { + lengths[gid] += strlen(s); + if (!GDK_STRNIL(sl)) { + next_sep_len = strlen(sl); + lengths[gid] += next_sep_len; + lastseplength[gid] = next_sep_len; + } else + lastseplength[gid] = 0; astrings[gid] = NULL; } else if (!skip_nils) { nils++; @@ -1059,9 +1066,9 @@ concat_strings(BAT **bnp, ValPtr pt, BAT if (astrings[gid]) { s = BUNtvar(bi, i); sl = BUNtvar(bis, i); - if (GDK_STRNIL(s) || GDK_STRNIL(sl)) + if (GDK_STRNIL(s)) continue; - if 
(astrings[gid][lengths[gid]]) { + if (astrings[gid][lengths[gid]] && !GDK_STRNIL(sl)) { next_sep_len = strlen(sl); memcpy(astrings[gid] + lengths[gid], sl, next_sep_len); lengths[gid] += next_sep_len; @@ -1109,7 +1116,7 @@ finish: gdk_return BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils, - bool abort_on_error, bool nil_if_empty, const char *separator) + bool abort_on_error, bool nil_if_empty, const char *restrict separator) { BUN ncand; struct canditer ci; @@ -1138,7 +1145,7 @@ BATstr_group_concat(ValPtr res, BAT *b, BAT * BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool skip_nils, - bool abort_on_error, const char *separator) + bool abort_on_error, const char *restrict separator) { BAT *bn = NULL; oid min, max; @@ -1186,3 +1193,122 @@ BATgroupstr_group_concat(BAT *b, BAT *g, return bn; } + +gdk_return +GDKanalytical_str_group_concat(BAT *r, BAT *b, BAT *sep, BAT *s, BAT *e, const char *restrict separator) +{ + BUN i = 0, j, l, cnt = BATcount(b); + lng *restrict start, *restrict end; + BATiter bi, bis; + str sb, sl, single_str = NULL, next_single_str; + bool empty; + size_t separator_length = 0, next_group_length, max_group_length = 0, next_length, offset, next_sep_len; + + assert(s && e && ((sep && !separator && BATcount(b) == BATcount(sep)) || (!sep && separator))); + start = (lng *) Tloc(s, 0); + end = (lng *) Tloc(e, 0); + + if (b->ttype != TYPE_str || r->ttype != TYPE_str || (sep && sep->ttype != TYPE_str)) { + GDKerror("BATgroupstr_group_concat: only string type is supported\n"); + return GDK_FAIL; + } + + if (sep && BATcount(sep) == 1) { /* Only one element in sep */ + bi = bat_iterator(sep); + separator = BUNtvar(bi, 0); + sep = NULL; + } + + if (sep) + bis = bat_iterator(sep); + else + separator_length = strlen(separator); + + bi = bat_iterator(b); + for (; i < cnt; i++) { + l = end[i]; + empty = true; + next_group_length = next_length = offset = 0; + + for (j = start[i]; j < l; j++) { + sb = BUNtvar(bi, 
j); + + if (separator) { + if (!GDK_STRNIL(sb)) { + next_group_length += strlen(sb); + if (!empty) + next_group_length += separator_length; + empty = false; + } + } else { /* sep case */ + sl = BUNtvar(bis, j); + + if (!GDK_STRNIL(sb)) { + next_group_length += strlen(sb); + if (!empty && !GDK_STRNIL(sl)) + next_group_length += strlen(sl); + empty = false; + } + } + } + + empty = true; + + if (!single_str) { + max_group_length = next_group_length; + if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) + goto allocation_error; + } else if (next_group_length > max_group_length) { + max_group_length = next_group_length; + if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) + goto allocation_error; + single_str = next_single_str; + } + + for (j = start[i]; j < l; j++) { + sb = BUNtvar(bi, j); + + if (separator) { + if (GDK_STRNIL(sb)) + continue; + if (!empty) { + memcpy(single_str + offset, separator, separator_length); + offset += separator_length; + } + next_length = strlen(sb); + memcpy(single_str + offset, sb, next_length); + offset += next_length; + empty = false; + } else { /* sep case */ + sl = BUNtvar(bis, j); + + if (GDK_STRNIL(sb)) + continue; + if (!empty && !GDK_STRNIL(sl)) { + next_sep_len = strlen(sl); + memcpy(single_str + offset, sl, next_sep_len); + offset += next_sep_len; + } + next_length = strlen(sb); + memcpy(single_str + offset, sb, next_length); + offset += next_length; + empty = false; + } + } + + single_str[offset] = '\0'; + if (BUNappend(r, single_str, false) != GDK_SUCCEED) + goto allocation_error; + + } + + GDKfree(single_str); + BATsetcount(r, cnt); + r->tnonil = true; + r->tnil = false; + return GDK_SUCCEED; + allocation_error: + GDKfree(single_str); + GDKerror("%s: malloc failure\n", __func__); + return GDK_FAIL; +} diff --git a/sql/backends/monet5/sql_rank.c b/sql/backends/monet5/sql_rank.c --- a/sql/backends/monet5/sql_rank.c +++ b/sql/backends/monet5/sql_rank.c @@ -1656,7 +1656,7 @@ 
do_covariance_and_correlation(Client cnt str msg = MAL_SUCCEED; (void)cntxt; - if (pci->argc != 5 || ((isaBatType(getArgType(mb, pci, 2)) && getBatType(getArgType(mb, pci, 3)) != TYPE_lng) || + if (pci->argc != 5 || ((isaBatType(getArgType(mb, pci, 3)) && getBatType(getArgType(mb, pci, 3)) != TYPE_lng) || (isaBatType(getArgType(mb, pci, 4)) && getBatType(getArgType(mb, pci, 4)) != TYPE_lng))) { throw(SQL, op, "%s", err); } @@ -1787,3 +1787,78 @@ SQLcorr(Client cntxt, MalBlkPtr mb, MalS return do_covariance_and_correlation(cntxt, mb, stk, pci, "sql.corr", SQLSTATE(42000) "corr(:any_1,:any_1,:lng,:lng)", GDKanalytical_correlation, 0, dbl_nil); } + +str +SQLstrgroup_concat(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +{ + BAT *r = NULL, *b = NULL, *sep = NULL, *s = NULL, *e = NULL; + int parameters = 2; + gdk_return gdk_res = GDK_SUCCEED; + str msg = MAL_SUCCEED, separator = NULL; + + (void)cntxt; + if (pci->argc != 4 && pci->argc != 5) + throw(SQL, "sql.strgroup_concat", SQLSTATE(42000) "Requires 4 or 5 parameters"); + + if (isaBatType(getArgType(mb, pci, 1))) { + bat *res = getArgReference_bat(stk, pci, 0); + + b = BATdescriptor(*getArgReference_bat(stk, pci, 1)); + if (!b) + throw(SQL, "sql.strgroup_concat", SQLSTATE(HY005) "Cannot access column descriptor"); + voidresultBAT(r, TYPE_str, BATcount(b), b, "sql.strgroup_concat"); + + if (pci->argc == 5) { + if (isaBatType(getArgType(mb, pci, 2))) { + sep = BATdescriptor(*getArgReference_bat(stk, pci, 2)); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list