Changeset: 7a088315370a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=7a088315370a
Modified Files:
	gdk/gdk_analytic.h
	gdk/gdk_analytic_func.c
	sql/backends/monet5/sql.c
	sql/backends/monet5/sql_rank.c
	sql/backends/monet5/sql_rank.h
Branch: window-tunning
Log Message:
Updated covariance and correlation diffs (truncated from 1705 to 300 lines): diff --git a/gdk/gdk_analytic.h b/gdk/gdk_analytic.h --- a/gdk/gdk_analytic.h +++ b/gdk/gdk_analytic.h @@ -41,8 +41,8 @@ gdk_export gdk_return GDKanalytical_stdd gdk_export gdk_return GDKanalytical_stddev_pop(BAT *r, BAT *p, BAT *o, BAT *b, BAT *s, BAT *e, int tpe, int frame_type); gdk_export gdk_return GDKanalytical_variance_samp(BAT *r, BAT *p, BAT *o, BAT *b, BAT *s, BAT *e, int tpe, int frame_type); gdk_export gdk_return GDKanalytical_variance_pop(BAT *r, BAT *p, BAT *o, BAT *b, BAT *s, BAT *e, int tpe, int frame_type); -gdk_export gdk_return GDKanalytical_covariance_pop(BAT *r, BAT *b1, BAT *b2, BAT *s, BAT *e, int tpe); -gdk_export gdk_return GDKanalytical_covariance_samp(BAT *r, BAT *b1, BAT *b2, BAT *s, BAT *e, int tpe); -gdk_export gdk_return GDKanalytical_correlation(BAT *r, BAT *b1, BAT *b2, BAT *s, BAT *e, int tpe); +gdk_export gdk_return GDKanalytical_covariance_pop(BAT *r, BAT *p, BAT *o, BAT *b1, BAT *b2, BAT *s, BAT *e, int tpe, int frame_type); +gdk_export gdk_return GDKanalytical_covariance_samp(BAT *r, BAT *p, BAT *o, BAT *b1, BAT *b2, BAT *s, BAT *e, int tpe, int frame_type); +gdk_export gdk_return GDKanalytical_correlation(BAT *r, BAT *p, BAT *o, BAT *b1, BAT *b2, BAT *s, BAT *e, int tpe, int frame_type); #endif //_GDK_ANALYTIC_H_ diff --git a/gdk/gdk_analytic_func.c b/gdk/gdk_analytic_func.c --- a/gdk/gdk_analytic_func.c +++ b/gdk/gdk_analytic_func.c @@ -2623,6 +2623,7 @@ GDKanalyticalavginteger(BAT *r, BAT *p, #define ANALYTICAL_STDEV_VARIANCE_UNBOUNDED_TILL_CURRENT_ROW(TPE, SAMPLE, OP) \ do { \ + TPE *bp = (TPE*)Tloc(b, 0); \ for (; k < i;) { \ j = k; \ do { \ @@ -2653,10 +2654,11 @@ GDKanalyticalavginteger(BAT *r, BAT *p, #define ANALYTICAL_STDEV_VARIANCE_CURRENT_ROW_TILL_UNBOUNDED(TPE, SAMPLE, OP) \ do { \ + TPE *bp = (TPE*)Tloc(b, 0); \ l = i - 1; \ for (j = l; ; j--) { \ TPE v = bp[j]; \ - if (!is_##TPE##_nil(bp[j])) { \ + if (!is_##TPE##_nil(v)) { \ n++; \ 
delta = (dbl) v - mean; \ mean += delta / n; \ @@ -2686,6 +2688,7 @@ GDKanalyticalavginteger(BAT *r, BAT *p, #define ANALYTICAL_STDEV_VARIANCE_ALL_ROWS(TPE, SAMPLE, OP) \ do { \ + TPE *bp = (TPE*)Tloc(b, 0); \ for (; j < i; j++) { \ TPE v = bp[j]; \ if (is_##TPE##_nil(v)) \ @@ -2712,7 +2715,6 @@ GDKanalyticalavginteger(BAT *r, BAT *p, #define ANALYTICAL_STDEV_VARIANCE_CURRENT_ROW(TPE, SAMPLE, OP) \ do { \ - (void) bp; \ for (; k < i; k++) \ rb[k] = SAMPLE == 1 ? dbl_nil : 0; \ has_nils = is_dbl_nil(rb[k - 1]); \ @@ -2720,6 +2722,7 @@ GDKanalyticalavginteger(BAT *r, BAT *p, #define ANALYTICAL_STDEV_VARIANCE_OTHERS(TPE, SAMPLE, OP) \ do { \ + TPE *bp = (TPE*)Tloc(b, 0); \ for (; k < i; k++) { \ TPE *bs = bp + start[k], *be = bp + end[k]; \ for (; bs < be; bs++) { \ @@ -2745,9 +2748,8 @@ GDKanalyticalavginteger(BAT *r, BAT *p, } \ } while (0) -#define ANALYTICAL_STDEV_VARIANCE_PARTITIONS(TPE, SAMPLE, OP, IMP) \ +#define ANALYTICAL_STATISTICS_PARTITIONS(TPE, SAMPLE, OP, IMP) \ do { \ - TPE *bp = (TPE*)Tloc(b, 0); \ if (p) { \ for (; i < cnt; i++) { \ if (np[i]) \ @@ -2759,30 +2761,36 @@ GDKanalyticalavginteger(BAT *r, BAT *p, } while (0) #ifdef HAVE_HGE -#define ANALYTICAL_STDEV_VARIANCE_LIMIT(IMP, SAMPLE, OP) \ +#define ANALYTICAL_STATISTICS_LIMIT(IMP, SAMPLE, OP) \ case TYPE_hge: \ - ANALYTICAL_STDEV_VARIANCE_PARTITIONS(hge, SAMPLE, OP, ANALYTICAL_STDEV_VARIANCE_##IMP); \ + ANALYTICAL_STATISTICS_PARTITIONS(hge, SAMPLE, OP, ANALYTICAL_##IMP); \ break; #else -#define ANALYTICAL_STDEV_VARIANCE_LIMIT(IMP, SAMPLE, OP) +#define ANALYTICAL_STATISTICS_LIMIT(IMP, SAMPLE, OP) #endif -#define ANALYTICAL_STDEV_VARIANCE_BRANCHES(IMP, SAMPLE, OP) \ +#define ANALYTICAL_STATISTICS_BRANCHES(IMP, SAMPLE, OP) \ do { \ switch (tpe) { \ case TYPE_bte: \ - ANALYTICAL_STDEV_VARIANCE_PARTITIONS(bte, SAMPLE, OP, ANALYTICAL_STDEV_VARIANCE_##IMP); \ + ANALYTICAL_STATISTICS_PARTITIONS(bte, SAMPLE, OP, ANALYTICAL_##IMP); \ break; \ case TYPE_sht: \ - ANALYTICAL_STDEV_VARIANCE_PARTITIONS(sht, 
SAMPLE, OP, ANALYTICAL_STDEV_VARIANCE_##IMP); \ + ANALYTICAL_STATISTICS_PARTITIONS(sht, SAMPLE, OP, ANALYTICAL_##IMP); \ break; \ case TYPE_int: \ - ANALYTICAL_STDEV_VARIANCE_PARTITIONS(int, SAMPLE, OP, ANALYTICAL_STDEV_VARIANCE_##IMP); \ + ANALYTICAL_STATISTICS_PARTITIONS(int, SAMPLE, OP, ANALYTICAL_##IMP); \ break; \ case TYPE_lng: \ - ANALYTICAL_STDEV_VARIANCE_PARTITIONS(lng, SAMPLE, OP, ANALYTICAL_STDEV_VARIANCE_##IMP); \ + ANALYTICAL_STATISTICS_PARTITIONS(lng, SAMPLE, OP, ANALYTICAL_##IMP); \ + break; \ + case TYPE_flt: \ + ANALYTICAL_STATISTICS_PARTITIONS(flt, SAMPLE, OP, ANALYTICAL_##IMP); \ break; \ - ANALYTICAL_STDEV_VARIANCE_LIMIT(IMP, SAMPLE, OP) \ + case TYPE_dbl: \ + ANALYTICAL_STATISTICS_PARTITIONS(dbl, SAMPLE, OP, ANALYTICAL_##IMP); \ + break; \ + ANALYTICAL_STATISTICS_LIMIT(IMP, SAMPLE, OP) \ default: \ goto nosupport; \ } \ @@ -2801,23 +2809,23 @@ GDKanalytical_##NAME(BAT *r, BAT *p, BAT \ switch (frame_type) { \ case 3: /* unbounded until current row */ { \ - ANALYTICAL_STDEV_VARIANCE_BRANCHES(UNBOUNDED_TILL_CURRENT_ROW, SAMPLE, OP); \ + ANALYTICAL_STATISTICS_BRANCHES(STDEV_VARIANCE_UNBOUNDED_TILL_CURRENT_ROW, SAMPLE, OP); \ } break; \ case 4: /* current row until unbounded */ { \ - ANALYTICAL_STDEV_VARIANCE_BRANCHES(CURRENT_ROW_TILL_UNBOUNDED, SAMPLE, OP); \ + ANALYTICAL_STATISTICS_BRANCHES(STDEV_VARIANCE_CURRENT_ROW_TILL_UNBOUNDED, SAMPLE, OP); \ } break; \ case 5: /* all rows */ { \ - ANALYTICAL_STDEV_VARIANCE_BRANCHES(ALL_ROWS, SAMPLE, OP); \ + ANALYTICAL_STATISTICS_BRANCHES(STDEV_VARIANCE_ALL_ROWS, SAMPLE, OP); \ } break; \ case 6: /* current row */ { \ - ANALYTICAL_STDEV_VARIANCE_BRANCHES(CURRENT_ROW, SAMPLE, OP); \ + ANALYTICAL_STATISTICS_BRANCHES(STDEV_VARIANCE_CURRENT_ROW, SAMPLE, OP); \ } break; \ default: { \ - ANALYTICAL_STDEV_VARIANCE_BRANCHES(OTHERS, SAMPLE, OP); \ + ANALYTICAL_STATISTICS_BRANCHES(STDEV_VARIANCE_OTHERS, SAMPLE, OP); \ } \ } \ \ - BATsetcount(r, cnt); \ + BATsetcount(r, (BUN) cnt); \ r->tnonil = !has_nils; \ r->tnil = 
has_nils; \ return GDK_SUCCEED; \ @@ -2834,16 +2842,123 @@ GDK_ANALYTICAL_STDEV_VARIANCE(stddev_pop GDK_ANALYTICAL_STDEV_VARIANCE(variance_samp, 1, m2 / (n - 1), "variance") GDK_ANALYTICAL_STDEV_VARIANCE(variance_pop, 0, m2 / n, "variance") -#define ANALYTICAL_COVARIANCE_CALC(TPE, SAMPLE, OP) \ +#define ANALYTICAL_COVARIANCE_UNBOUNDED_TILL_CURRENT_ROW(TPE, SAMPLE, OP) \ + do { \ + TPE *bp1 = (TPE*)Tloc(b1, 0), *bp2 = (TPE*)Tloc(b2, 0); \ + for (; k < i;) { \ + j = k; \ + do { \ + TPE v1 = bp1[k], v2 = bp2[k]; \ + if (!is_##TPE##_nil(v1) && !is_##TPE##_nil(v2)) { \ + n++; \ + delta1 = (dbl) v1 - mean1; \ + mean1 += delta1 / n; \ + delta2 = (dbl) v2 - mean2; \ + mean2 += delta2 / n; \ + m2 += delta1 * ((dbl) v2 - mean2); \ + } \ + k++; \ + } while (k < i && !op[k]); \ + if (isinf(m2)) \ + goto overflow; \ + if (n > SAMPLE) { \ + for (; j < k; j++) \ + rb[j] = OP; \ + } else { \ + for (; j < k; j++) \ + rb[j] = dbl_nil; \ + has_nils = true; \ + } \ + } \ + n = 0; \ + mean1 = 0; \ + mean2 = 0; \ + m2 = 0; \ + } while (0) + +#define ANALYTICAL_COVARIANCE_CURRENT_ROW_TILL_UNBOUNDED(TPE, SAMPLE, OP) \ + do { \ + TPE *bp1 = (TPE*)Tloc(b1, 0), *bp2 = (TPE*)Tloc(b2, 0); \ + l = i - 1; \ + for (j = l; ; j--) { \ + TPE v1 = bp1[j], v2 = bp2[j]; \ + if (!is_##TPE##_nil(v1) && !is_##TPE##_nil(v2)) { \ + n++; \ + delta1 = (dbl) v1 - mean1; \ + mean1 += delta1 / n; \ + delta2 = (dbl) v2 - mean2; \ + mean2 += delta2 / n; \ + m2 += delta1 * ((dbl) v2 - mean2); \ + } \ + if (op[j] || j == k) { \ + if (isinf(m2)) \ + goto overflow; \ + if (n > SAMPLE) { \ + for (; l >= j; l--) \ + rb[l] = OP; \ + } else { \ + for (; l >= j; l--) \ + rb[l] = dbl_nil; \ + has_nils = true; \ + } \ + if (j == k) \ + break; \ + l = j - 1; \ + } \ + } \ + n = 0; \ + mean1 = 0; \ + mean2 = 0; \ + m2 = 0; \ + k = i; \ + } while (0) + +#define ANALYTICAL_COVARIANCE_ALL_ROWS(TPE, SAMPLE, OP) \ + do { \ + TPE *bp1 = (TPE*)Tloc(b1, 0), *bp2 = (TPE*)Tloc(b2, 0); \ + for (; j < i; j++) { \ + TPE v1 = bp1[j], v2 = 
bp2[j]; \ + if (!is_##TPE##_nil(v1) && !is_##TPE##_nil(v2)) { \ + n++; \ + delta1 = (dbl) v1 - mean1; \ + mean1 += delta1 / n; \ + delta2 = (dbl) v2 - mean2; \ + mean2 += delta2 / n; \ + m2 += delta1 * ((dbl) v2 - mean2); \ + } \ + } \ + if (isinf(m2)) \ + goto overflow; \ + if (n > SAMPLE) { \ + for (; k < i; k++) \ + rb[k] = OP; \ + } else { \ + for (; k < i; k++) \ + rb[k] = dbl_nil; \ + has_nils = true; \ + } \ + n = 0; \ + mean1 = 0; \ + mean2 = 0; \ + m2 = 0; \ + } while (0) + +#define ANALYTICAL_COVARIANCE_CURRENT_ROW(TPE, SAMPLE, OP) \ do { \ - TPE *bp1 = (TPE*)Tloc(b1, 0), *bp2 = (TPE*)Tloc(b2, 0), *bs1, *be1, *bs2, v1, v2; \ - for (; i < cnt; i++, rb++) { \ - bs1 = bp1 + start[i]; \ - be1 = bp1 + end[i]; \ - bs2 = bp2 + start[i]; \ + for (; k < i; k++) \ + rb[k] = SAMPLE == 1 ? dbl_nil : 0; \ + has_nils = is_dbl_nil(rb[k - 1]); \ + } while (0) + +#define ANALYTICAL_COVARIANCE_OTHERS(TPE, SAMPLE, OP) \ + do { \ + TPE *bp1 = (TPE*)Tloc(b1, 0), *bp2 = (TPE*)Tloc(b2, 0); \ + for (; k < i; k++) { \ + TPE *bs1 = bp1 + start[k]; \ + TPE *be1 = bp1 + end[k]; \ + TPE *bs2 = bp2 + start[k]; \ for (; bs1 < be1; bs1++, bs2++) { \ - v1 = *bs1; \ - v2 = *bs2; \ + TPE v1 = *bs1, v2 = *bs2; \ if (is_##TPE##_nil(v1) || is_##TPE##_nil(v2)) \ continue; \ n++; \ @@ -2853,13 +2968,13 @@ GDK_ANALYTICAL_STDEV_VARIANCE(variance_p mean2 += delta2 / n; \ m2 += delta1 * ((dbl) v2 - mean2); \ } \ - if (isinf(m2)) { \ + if (isinf(m2)) \ goto overflow; \ - } else if (n > SAMPLE) { \ - *rb = OP; \ + if (n > SAMPLE) { \ + rb[k] = OP; \ } else { \ - *rb = dbl_nil; \ - nils++; \ + rb[k] = dbl_nil; \ + has_nils = true; \ } \ n = 0; \ _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list