Changeset: b7d407bb4c1b for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b7d407bb4c1b
Modified Files:
        gdk/gdk_analytic_func.c
        gdk/gdk_calc.h
        gdk/gdk_string.c
        sql/backends/monet5/sql_rank.c
        sql/backends/monet5/sql_rank.h
        sql/backends/monet5/sql_rank.mal
        sql/backends/monet5/sql_rank.mal.sh
        sql/backends/monet5/sql_upgrades.c
        sql/common/sql_types.c
        sql/scripts/51_sys_schema_extension.sql
        sql/test/group-concat/Tests/groupconcat05.stable.out
Branch: statistics-analytics
Log Message:

Implemented group_concat as a window function. Also fixed null case handling in 
the aggregate version


diffs (truncated from 538 to 300 lines):

diff --git a/gdk/gdk_analytic_func.c b/gdk/gdk_analytic_func.c
--- a/gdk/gdk_analytic_func.c
+++ b/gdk/gdk_analytic_func.c
@@ -1018,8 +1018,7 @@ GDKanalytical##OP(BAT *r, BAT *b, BAT *s
                        }                                               \
                        if (BUNappend(r, curval, false) != GDK_SUCCEED) \
                                goto allocation_error;                  \
-                       if (atomcmp(curval, nil) == 0)                  \
-                               has_nils = true;                        \
+                       has_nils |= atomcmp(curval, nil) == 0;  \
                }                                                       \
        }                                                               \
        }                                                               \
diff --git a/gdk/gdk_calc.h b/gdk/gdk_calc.h
--- a/gdk/gdk_calc.h
+++ b/gdk/gdk_calc.h
@@ -161,5 +161,6 @@ gdk_export BAT *BATgroupcovariance_sampl
 gdk_export BAT *BATgroupcovariance_population(BAT *b1, BAT *b2, BAT *g, BAT 
*e, BAT *s, int tp, bool skip_nils, bool abort_on_error);
 gdk_export BAT *BATgroupcorrelation(BAT *b1, BAT *b2, BAT *g, BAT *e, BAT *s, 
int tp, bool skip_nils, bool abort_on_error);
 
-gdk_export BAT *BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT 
*sep, bool skip_nils, bool abort_on_error, const char *separator);
-gdk_export gdk_return BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT 
*sep, bool skip_nils, bool abort_on_error, bool nil_if_empty, const char 
*separator);
+gdk_export BAT *BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT 
*sep, bool skip_nils, bool abort_on_error, const char *restrict separator);
+gdk_export gdk_return BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT 
*sep, bool skip_nils, bool abort_on_error, bool nil_if_empty, const char 
*restrict separator);
+gdk_export gdk_return GDKanalytical_str_group_concat(BAT *r, BAT *b, BAT *sep, 
BAT *s, BAT *e, const char *restrict separator);
diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -814,7 +814,7 @@ static gdk_return
 concat_strings(BAT **bnp, ValPtr pt, BAT *b, oid seqb,
                           BUN ngrp, struct canditer *restrict ci, BUN ncand,
                           const oid *restrict gids, oid min, oid max, bool 
skip_nils,
-                          BAT *sep, const char *separator, BUN *has_nils)
+                          BAT *sep, const char *restrict separator, BUN 
*has_nils)
 {
        oid gid;
        BUN i, p, nils = 0;
@@ -866,15 +866,22 @@ concat_strings(BAT **bnp, ValPtr pt, BAT
                                p = canditer_next(ci) - seqb;
                                s = BUNtvar(bi, p);
                                sl = BUNtvar(bis, p);
-                               if (GDK_STRNIL(s) || GDK_STRNIL(sl)) {
+                               if (GDK_STRNIL(s)) {
                                        if (!skip_nils) {
                                                nils = 1;
                                                break;
                                        }
                                } else {
                                        single_length += strlen(s);
-                                       if (!empty)
-                                               single_length += strlen(sl);
+                                       if (!empty) {
+                                               if (GDK_STRNIL(sl)) {
+                                                       if (!skip_nils) {
+                                                               nils = 1;
+                                                               break;
+                                                       }
+                                               } else
+                                                       single_length += 
strlen(sl);
+                                       }
                                        empty = false;
                                }
                        }
@@ -907,9 +914,9 @@ concat_strings(BAT **bnp, ValPtr pt, BAT
                                        p = canditer_next(ci) - seqb;
                                        s = BUNtvar(bi, p);
                                        sl = BUNtvar(bis, p);
-                                       if (GDK_STRNIL(s) || GDK_STRNIL(sl))
+                                       if (GDK_STRNIL(s))
                                                continue;
-                                       if (!empty) {
+                                       if (!empty && !GDK_STRNIL(sl)) {
                                                next_sep_len = strlen(sl);
                                                memcpy(single_str + offset, sl, 
next_sep_len);
                                                offset += next_sep_len;
@@ -921,11 +928,7 @@ concat_strings(BAT **bnp, ValPtr pt, BAT
                                }
                        }
 
-                       if (empty) {
-                               if (!(single_str = GDKstrdup(str_nil)))
-                                       return GDK_FAIL;
-                       } else
-                               single_str[offset] = '\0';
+                       single_str[offset] = '\0';
                        if (bn) {
                                if (BUNappend(bn, single_str, false) != 
GDK_SUCCEED) {
                                        GDKfree(single_str);
@@ -989,10 +992,14 @@ concat_strings(BAT **bnp, ValPtr pt, BAT
                                                continue;
                                        s = BUNtvar(bi, i);
                                        sl = BUNtvar(bis, i);
-                                       if (!GDK_STRNIL(s) && !GDK_STRNIL(sl)) {
-                                               next_sep_len = strlen(sl);
-                                               lengths[gid] += strlen(s) + 
next_sep_len;
-                                               lastseplength[gid] = 
next_sep_len;
+                                       if (!GDK_STRNIL(s)) {
+                                               lengths[gid] += strlen(s);
+                                               if (!GDK_STRNIL(sl)) {
+                                                       next_sep_len = 
strlen(sl);
+                                                       lengths[gid] += 
next_sep_len;
+                                                       lastseplength[gid] = 
next_sep_len;
+                                               } else
+                                                       lastseplength[gid] = 0;
                                                astrings[gid] = NULL;
                                        } else if (!skip_nils) {
                                                nils++;
@@ -1059,9 +1066,9 @@ concat_strings(BAT **bnp, ValPtr pt, BAT
                                        if (astrings[gid]) {
                                                s = BUNtvar(bi, i);
                                                sl = BUNtvar(bis, i);
-                                               if (GDK_STRNIL(s) || 
GDK_STRNIL(sl))
+                                               if (GDK_STRNIL(s))
                                                        continue;
-                                               if 
(astrings[gid][lengths[gid]]) {
+                                               if (astrings[gid][lengths[gid]] 
&& !GDK_STRNIL(sl)) {
                                                        next_sep_len = 
strlen(sl);
                                                        memcpy(astrings[gid] + 
lengths[gid], sl, next_sep_len);
                                                        lengths[gid] += 
next_sep_len;
@@ -1109,7 +1116,7 @@ finish:
 
 gdk_return
 BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils, 
-                                       bool abort_on_error, bool nil_if_empty, 
const char *separator)
+                                       bool abort_on_error, bool nil_if_empty, 
const char *restrict separator)
 {
        BUN ncand;
        struct canditer ci;
@@ -1138,7 +1145,7 @@ BATstr_group_concat(ValPtr res, BAT *b, 
 
 BAT *
 BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool 
skip_nils,
-                                                bool abort_on_error, const 
char *separator)
+                                                bool abort_on_error, const 
char *restrict separator)
 {
        BAT *bn = NULL;
        oid min, max;
@@ -1186,3 +1193,122 @@ BATgroupstr_group_concat(BAT *b, BAT *g,
 
        return bn;
 }
+
+/*
+ * Window-function variant of group_concat.
+ *
+ * For every row i of b, the non-nil strings of b at positions
+ * [start[i], end[i]) are concatenated and the result is appended to r;
+ * start and end are the lng values stored in the BATs s and e.
+ *
+ * Values are joined either with the constant `separator' or, when a
+ * separator BAT `sep' is given, with the sep value stored on the same
+ * row as the value being appended.  A nil separator (constant or per
+ * row) contributes nothing.  Nil values of b are skipped, so a frame
+ * without any non-nil value yields an empty string, and r is flagged
+ * as nil-free on success.
+ *
+ * Exactly one of sep and separator is expected (see the assert); a sep
+ * BAT holding a single value is treated as a constant separator.
+ *
+ * Returns GDK_SUCCEED, or GDK_FAIL when a type is not str or when an
+ * allocation/append fails.
+ */
+gdk_return
+GDKanalytical_str_group_concat(BAT *r, BAT *b, BAT *sep, BAT *s, BAT *e,
+                               const char *restrict separator)
+{
+       BUN i, j, l, cnt = BATcount(b);
+       lng *restrict start, *restrict end;
+       BATiter bi, bis;
+       str sb, sl, single_str = NULL, next_single_str;
+       bool empty;
+       size_t separator_length = 0, next_group_length, max_group_length = 0,
+               next_length, offset;
+
+       assert(s && e &&
+              ((sep && !separator && BATcount(b) == BATcount(sep)) ||
+               (!sep && separator)));
+       start = (lng *) Tloc(s, 0);
+       end = (lng *) Tloc(e, 0);
+
+       if (b->ttype != TYPE_str || r->ttype != TYPE_str ||
+           (sep && sep->ttype != TYPE_str)) {
+               /* report under this function's own name, as done below */
+               GDKerror("%s: only string type is supported\n", __func__);
+               return GDK_FAIL;
+       }
+
+       if (sep && BATcount(sep) == 1) { /* Only one element in sep */
+               bi = bat_iterator(sep);
+               separator = BUNtvar(bi, 0);
+               sep = NULL;
+       }
+
+       if (sep)
+               bis = bat_iterator(sep);
+       else if (!GDK_STRNIL(separator))
+               /* a nil constant separator keeps length 0, matching the
+                * per-row case where nil separators are skipped */
+               separator_length = strlen(separator);
+
+       bi = bat_iterator(b);
+       for (i = 0; i < cnt; i++) {
+               l = end[i];
+               next_group_length = offset = 0;
+
+               /* first pass: number of bytes this frame needs */
+               empty = true;
+               for (j = start[i]; j < l; j++) {
+                       sb = BUNtvar(bi, j);
+                       if (GDK_STRNIL(sb))
+                               continue;
+                       next_group_length += strlen(sb);
+                       if (!empty) {
+                               /* separators only go between two values */
+                               if (separator) {
+                                       next_group_length += separator_length;
+                               } else { /* sep case */
+                                       sl = BUNtvar(bis, j);
+                                       if (!GDK_STRNIL(sl))
+                                               next_group_length += strlen(sl);
+                               }
+                       }
+                       empty = false;
+               }
+
+               /* one scratch buffer, grown only when a frame needs more */
+               if (!single_str) {
+                       max_group_length = next_group_length;
+                       single_str = GDKmalloc(max_group_length + 1);
+                       if (single_str == NULL)
+                               goto allocation_error;
+               } else if (next_group_length > max_group_length) {
+                       max_group_length = next_group_length;
+                       next_single_str = GDKrealloc(single_str,
+                                                    max_group_length + 1);
+                       if (next_single_str == NULL)
+                               goto allocation_error;
+                       single_str = next_single_str;
+               }
+
+               /* second pass: copy values and separators into the buffer,
+                * following exactly the same rules as the sizing pass */
+               empty = true;
+               for (j = start[i]; j < l; j++) {
+                       sb = BUNtvar(bi, j);
+                       if (GDK_STRNIL(sb))
+                               continue;
+                       if (!empty) {
+                               if (separator) {
+                                       memcpy(single_str + offset, separator,
+                                              separator_length);
+                                       offset += separator_length;
+                               } else { /* sep case */
+                                       sl = BUNtvar(bis, j);
+                                       if (!GDK_STRNIL(sl)) {
+                                               next_length = strlen(sl);
+                                               memcpy(single_str + offset, sl,
+                                                      next_length);
+                                               offset += next_length;
+                                       }
+                               }
+                       }
+                       next_length = strlen(sb);
+                       memcpy(single_str + offset, sb, next_length);
+                       offset += next_length;
+                       empty = false;
+               }
+
+               single_str[offset] = '\0';
+               if (BUNappend(r, single_str, false) != GDK_SUCCEED)
+                       goto allocation_error;
+       }
+
+       GDKfree(single_str);
+       BATsetcount(r, cnt);
+       /* every appended value is a terminated (possibly empty) string */
+       r->tnonil = true;
+       r->tnil = false;
+       return GDK_SUCCEED;
+ allocation_error:
+       GDKfree(single_str);
+       GDKerror("%s: malloc failure\n", __func__);
+       return GDK_FAIL;
+}
diff --git a/sql/backends/monet5/sql_rank.c b/sql/backends/monet5/sql_rank.c
--- a/sql/backends/monet5/sql_rank.c
+++ b/sql/backends/monet5/sql_rank.c
@@ -1656,7 +1656,7 @@ do_covariance_and_correlation(Client cnt
        str msg = MAL_SUCCEED;
 
        (void)cntxt;
-       if (pci->argc != 5 || ((isaBatType(getArgType(mb, pci, 2)) && 
getBatType(getArgType(mb, pci, 3)) != TYPE_lng) ||
+       if (pci->argc != 5 || ((isaBatType(getArgType(mb, pci, 3)) && 
getBatType(getArgType(mb, pci, 3)) != TYPE_lng) ||
                 (isaBatType(getArgType(mb, pci, 4)) && 
getBatType(getArgType(mb, pci, 4)) != TYPE_lng))) {
                throw(SQL, op, "%s", err);
        }
@@ -1787,3 +1787,78 @@ SQLcorr(Client cntxt, MalBlkPtr mb, MalS
        return do_covariance_and_correlation(cntxt, mb, stk, pci, "sql.corr", 
SQLSTATE(42000) "corr(:any_1,:any_1,:lng,:lng)",
                                                                                
 GDKanalytical_correlation, 0, dbl_nil);
 }
+
+str
+SQLstrgroup_concat(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+       BAT *r = NULL, *b = NULL, *sep = NULL, *s = NULL, *e = NULL;
+       int parameters = 2;
+       gdk_return gdk_res = GDK_SUCCEED;
+       str msg = MAL_SUCCEED, separator = NULL;
+
+       (void)cntxt;
+       if (pci->argc != 4 && pci->argc != 5)
+               throw(SQL, "sql.strgroup_concat", SQLSTATE(42000) "Requires 4 
or 5 parameters");
+
+       if (isaBatType(getArgType(mb, pci, 1))) {
+               bat *res = getArgReference_bat(stk, pci, 0);
+
+               b = BATdescriptor(*getArgReference_bat(stk, pci, 1));
+               if (!b)
+                       throw(SQL, "sql.strgroup_concat", SQLSTATE(HY005) 
"Cannot access column descriptor");
+               voidresultBAT(r, TYPE_str, BATcount(b), b, 
"sql.strgroup_concat");
+
+               if (pci->argc == 5) {
+                       if (isaBatType(getArgType(mb, pci, 2))) {
+                               sep = BATdescriptor(*getArgReference_bat(stk, 
pci, 2));
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to