Changeset: 5f5feac01939 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/5f5feac01939 Modified Files: gdk/gdk_histogram.c Branch: histograms Log Message:
Initial attempt for strings histogram diffs (164 lines): diff --git a/gdk/gdk_histogram.c b/gdk/gdk_histogram.c --- a/gdk/gdk_histogram.c +++ b/gdk/gdk_histogram.c @@ -25,6 +25,7 @@ static_assert(NBUCKETS > 2, "The number of buckets must be > 2"); static_assert(SAMPLE_SIZE > NBUCKETS, "The sample size must be > number of buckets"); static_assert(SAMPLE_SIZE < INT_MAX, "The sample size must be on the integer range"); +static_assert((SAMPLE_SIZE % NBUCKETS) == 0, "The sample size must be a multiple of the number of buckets"); #ifdef HAVE_HGE #define VAR_UPCAST hge @@ -59,6 +60,7 @@ can_create_histogram(int tpe) #ifdef HAVE_HGE case TYPE_hge: #endif + case TYPE_str: return true; default: /* type not supported */ return false; @@ -183,6 +185,7 @@ create_perfect_histogram(BAT *sam, Histo break; #endif default: + /* no perfect histograms on strings yet */ assert(0); } return h; @@ -270,6 +273,56 @@ create_generic_histogram(BAT *sam, Histo generic_histogram_loop(hge, abshge); break; #endif + case TYPE_str: { /* for strings use the hashed value (it cannot be used in ranges) */ + int i = 0; + const int gap = SAMPLE_SIZE / NBUCKETS; + HistogramBucket_int *restrict hist; + + h->nbuckets = NBUCKETS; + if (!(h->histogram = GDKmalloc(sizeof(HistogramBucket_int) * h->nbuckets))) + return NULL; + + hist = (HistogramBucket_int *) h->histogram; + for (BUN k = 0 ; k < NBUCKETS ; k++) { + HistogramBucket_int *restrict b = &(hist[k]); + b->min = i; + i += gap; + b->max = i; + b->count = 0; + } + + BATiter bi = bat_iterator(sam); + BUN c = bi.count; + for (BUN k = 0 ; k < c ; k++) { /* Now create the histogram */ + const char *next = BUNtvar(bi, k); + + if (strNil(next)) { + nulls++; + } else { + bool found = false; + int l = 0, r = NBUCKETS - 1; + /* pick the lowest bits according to the sample size from the hash value */ + int value = (int)(strHash(next) & (SAMPLE_SIZE - 1)); + + while (l <= r) { /* Do binary search to find the bucket */ + int m = l + (r - l) / 2; + HistogramBucket_int *restrict b = &(hist[m]); + if (value >= b->min && value < b->max) { + b->count++; + found = true; + break; + } else if (value < b->min) { /* value is smaller, ignore right half */ + r = m - 1; + } else { /* value is greater, ignore left half */ + l = m + 1; + } + } + assert(found); /* the value must be found */ + } + } + bat_iterator_end(&bi); + h->nulls = nulls; + } break; default: assert(0); } @@ -306,7 +359,7 @@ HISTOGRAMcreate(BAT *b) BAT *sids, *sam; Histogram *h; bool perfect_histogram = false; - ValRecord min, max, diff = {.vtype = VAR_TPE}, conv = {.vtype = VAR_TPE}, nbuckets = {.vtype = VAR_TPE}, truth = {.vtype = TYPE_bit}; + ValRecord min = {.vtype = bi.type}, max = {.vtype = bi.type}; if (!(sids = BATsample_with_seed(b, SAMPLE_SIZE, (uint64_t) GDKusec() * (uint64_t) b->batCacheid))) goto fail; @@ -315,27 +368,31 @@ HISTOGRAMcreate(BAT *b) if (!sam) goto fail; - if (bi.type == TYPE_bit) { - VALinit(&min, TYPE_bit, &(bit){0}); - VALinit(&max, TYPE_bit, &(bit){1}); - perfect_histogram = true; - } else { - min_and_max_values(&bi, &min, &max); - if (VARcalcsub(&diff, &max, &min, true) == GDK_SUCCEED) { - if (VARconvert(&conv, &diff, true, 0, 0, 0) == GDK_SUCCEED) { - VAR_UPCAST v = (VAR_UPCAST) NBUCKETS; - VALinit(&nbuckets, VAR_TPE, &v); - /* if the number of buckets is greater than max and min difference, then there is a 'perfect' histogram */ - if (VARcalcgt(&truth, &nbuckets, &conv) == GDK_SUCCEED) { - perfect_histogram = truth.val.btval == 1; + if (ATOMbasetype(bi.type) != TYPE_str) { + ValRecord diff = {.vtype = VAR_TPE}, conv = {.vtype = VAR_TPE}, nbuckets = {.vtype = VAR_TPE}, truth = {.vtype = TYPE_bit}; + + if (bi.type == TYPE_bit) { + VALinit(&min, TYPE_bit, &(bit){0}); + VALinit(&max, TYPE_bit, &(bit){1}); + perfect_histogram = true; + } else { + min_and_max_values(&bi, &min, &max); + if (VARcalcsub(&diff, &max, &min, true) == GDK_SUCCEED) { + if (VARconvert(&conv, &diff, true, 0, 0, 0) == GDK_SUCCEED) { + VAR_UPCAST v = (VAR_UPCAST) NBUCKETS; + VALinit(&nbuckets, VAR_TPE, &v); + /* if the number of buckets is greater than max and min difference, then there is a 'perfect' histogram */ + if (VARcalcgt(&truth, &nbuckets, &conv) == GDK_SUCCEED) { + perfect_histogram = truth.val.btval == 1; + } else { + GDKclrerr(); + } } else { GDKclrerr(); } } else { GDKclrerr(); } - } else { - GDKclrerr(); } } @@ -373,6 +430,7 @@ fail: #define histogram_print_loop(TPE) \ do { \ + ssize_t (*atomtostr)(str *, size_t *, const void *, bool) = BATatoms[TYPE_##TPE].atomToStr; \ HistogramBucket_##TPE *restrict hist = (HistogramBucket_##TPE *) b->thistogram->histogram; \ for (int i = 0 ; i < nbuckets ; i++) { \ HistogramBucket_##TPE *restrict hb = &(hist[i]); \ @@ -407,7 +465,6 @@ HISTOGRAMprint(BAT *b) size_t len = 0, rlen = 2048, minlen = 256, maxlen = 256; str res = NULL, minbuf = NULL, maxbuf = NULL; int tpe = ATOMbasetype(b->ttype), nbuckets; - ssize_t (*atomtostr)(str *, size_t *, const void *, bool) = BATatoms[b->ttype].atomToStr; if (VIEWtparent(b)) /* don't look on views */ b = BBP_cache(VIEWtparent(b)); @@ -438,6 +495,7 @@ HISTOGRAMprint(BAT *b) case TYPE_sht: histogram_print_loop(sht); break; + case TYPE_str: /* strings use integer size buckets */ case TYPE_int: histogram_print_loop(int); break; _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org