Changeset: 877d0c73a0f4 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/877d0c73a0f4
Added Files:
	monetdb5/modules/mal/ngrams_old.c
Modified Files:
	monetdb5/modules/mal/CMakeLists.txt
	monetdb5/modules/mal/ngrams.c
	monetdb5/modules/mal/ngrams.h
	sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:
Works with sequential pipe for now diffs (truncated from 2186 to 300 lines): diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt --- a/monetdb5/modules/mal/CMakeLists.txt +++ b/monetdb5/modules/mal/CMakeLists.txt @@ -41,7 +41,8 @@ target_sources(malmodules projectionpath.c tablet.c tablet.h batcalc.c calc.c - ngrams.c ngrams.h) + ngrams.c ngrams.h + ngrams_old.c) target_include_directories(malmodules PRIVATE diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c --- a/monetdb5/modules/mal/ngrams.c +++ b/monetdb5/modules/mal/ngrams.c @@ -26,8 +26,7 @@ is_prefix(const char *s1, const char *s2 static inline int is_suffix(const char *s1, const char *s2, int s2_len) { - int sl = str_strlen(s1); - return sl < s2_len ? -1 : strcmp(s1 + sl - s2_len, s2); + return strcmp(s1 + strlen(s1) - s2_len, s2); } static inline int @@ -53,11 +52,11 @@ static void ngrams_destroy(Ngrams *ng) { if (ng) { + GDKfree(ng->idx); + GDKfree(ng->sigs); GDKfree(ng->h); - GDKfree(ng->idx); GDKfree(ng->pos); GDKfree(ng->rid); - GDKfree(ng->sigs); } GDKfree(ng); } @@ -71,7 +70,7 @@ ngrams_create(size_t b_cnt, size_t ng_sz ng->sigs = GDKmalloc(b_cnt * sizeof(NGRAM_TYPE)); ng->h = GDKmalloc(ng_sz * sizeof(unsigned)); ng->pos = GDKmalloc(ng_sz * sizeof(unsigned)); - ng->rid = GDKzalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned)); + ng->rid = GDKmalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned)); } if (!ng || !ng->idx || !ng->sigs || !ng->h || !ng->pos || !ng->rid) { ngrams_destroy(ng); @@ -80,684 +79,6 @@ ngrams_create(size_t b_cnt, size_t ng_sz return ng; } -static Ngrams * -ngrams_create_old(BAT *b, size_t ngramsize) -{ - Ngrams *n = NULL; - size_t sz = BATcount(b); - - n = (Ngrams*)GDKmalloc(sizeof(Ngrams)); - if (n) { - n->h = (unsigned int*)GDKmalloc(ngramsize*sizeof(int)); - n->pos = (unsigned int*)GDKzalloc(ngramsize*sizeof(int)); - n->rid = (unsigned int*)GDKmalloc(NGRAM_MULTIPLE* sz * sizeof(int)); - n->idx = 
(NGRAM_TYPE*)GDKmalloc(ngramsize*sizeof(NGRAM_TYPE)); - n->sigs = (NGRAM_TYPE*)GDKmalloc(sz * sizeof(NGRAM_TYPE)); - } - if (!n || !n->h || !n->idx || !n->pos || !n->rid || !n->sigs) { - ngrams_destroy(n); - return NULL; - } - return n; -} - -static int -ngrams_init_1gram(Ngrams *n, BAT *b) -{ - BUN cnt = BATcount(b); - NGRAM_TYPE *h = (NGRAM_TYPE *)GDKzalloc(UNIGRAM_SZ*sizeof(NGRAM_TYPE)), *hist = (NGRAM_TYPE*)h, sum = 0; - int *id = (int*)GDKmalloc(UNIGRAM_SZ*sizeof(int)), i; - NGRAM_TYPE *idx = n->idx; - - if (!h || !id) { - GDKfree(h); - GDKfree(id); - return -1; - } - - BATiter bi = bat_iterator(b); - for(BUN i=0; i<cnt; i++) { - const char *s = BUNtail(bi,i); - if (!strNil(s) && *s) { /* skipped */ - for(; *s; s++) { - h[CHAR_MAP(*s)]++; - } - } - } - bat_iterator_end(&bi); - - int bc = 0; - - for(int i=0; i<UNIGRAM_SZ; i++) { - id[i] = i; - idx[i] = 0; - n->h[i] = (unsigned int)hist[i]; - } - GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), NGRAM_TYPEID, true, false); - for(i=UNIGRAM_SZ-1; i>=0; i--) { - if ((size_t)(sum + hist[i]) >= (NGRAM_MULTIPLE*cnt)-1) - break; - sum += hist[i]; - } - NGRAM_TYPE larger_cnt = hist[i]; - for(; hist[i] == larger_cnt; i++) - ; - NGRAM_TYPE max = hist[0], small = hist[i]; - n->max = max; - n->min = small; - - for(int i=0; i<UNIGRAM_SZ && hist[i] > 0; i++) { - unsigned int x=id[i]; - idx[x] = NGRAM_CST(1)<<bc; - assert(idx[x] > 0); - bc++; - bc %= NGRAM_BITS; - } - - bi = bat_iterator(b); - NGRAM_TYPE *sp = n->sigs; - unsigned int pos = 1; - for(BUN i=0; i<cnt; i++) { - const char *s = BUNtail(bi, i); - NGRAM_TYPE sig = 0; - if (!strNil(s) && s[0]) { /* too short skipped */ - for(; *s; s++) { - int k = CHAR_MAP(*s); - sig |= idx[k]; - if (n->h[k] <= n->min) { - if (n->pos[k] == 0) { - n->pos[k] = pos; - pos += n->h[k]; - n->h[k] = 0; - } - /* deduplicate */ - int done = (n->h[k] > 0 && n->rid[n->pos[k] + n->h[k]-1] == i); - if (!done) { - n->rid[n->pos[k] + n->h[k]] = i; - n->h[k]++; - } - } - } - *sp = 
sig; - } else { - *sp = NGRAM_TYPENIL; - } - sp++; - } - bat_iterator_end(&bi); - - GDKfree(h); - GDKfree(id); - return 0; -} - -static str -NGc1join_intern(bat *L, bat *R, bat *H, bat *N, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti) -{ - (void)nil_matches; - (void)estimate; - BAT *h = BATdescriptor(*H); - BAT *n = BATdescriptor(*N); - - if (lc && !is_bat_nil(*lc)) - assert(0); - if (rc && !is_bat_nil(*rc)) - assert(0); - - if (*anti) - throw(MAL, "gram.c1", "No anti contains yet\n"); - if (!h || !n) { - BBPreclaim(h); - BBPreclaim(n); - throw(MAL, "gram.c1", RUNTIME_OBJECT_MISSING); - } - - if (BATcount(n) < 10) { - printf("todo fall back to select \n"); - } - - Ngrams *ngi = ngrams_create_old(h, UNIGRAM_SZ); - if (ngi && ngrams_init_1gram(ngi, h) == 0) { /* TODO add locks and only create ngram once for full (parent bat) */ - BUN cnt = BATcount(h); - /* create L/R */ - BAT *l = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT); - BAT *r = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT); - - int ncnt = 0, ncnt1 = 0, ncnt2 = 0, ncnt3 = 0, ncnt4 = 0, ncnt5 = 0; - BATiter ni = bat_iterator(n); - BATiter hi = bat_iterator(h); - NGRAM_TYPE nmax = 0; - oid *ol = Tloc(l, 0), *el = ol + 10*cnt; - oid *or = Tloc(r, 0); - cnt = BATcount(n); - /* if needed grow */ - for(BUN i = 0; i<cnt; i++) { - const char *s = BUNtail(ni,i), *os = s; - NGRAM_TYPE sig = 0; - - if ((ol+1000) > el) - break; - if (!strNil(s) && s[0]) { - NGRAM_TYPE min = ngi->max; - unsigned int min_pos = 0; - for(; *s; s++) { - unsigned int k = CHAR_MAP(*s); - sig |= ngi->idx[k]; - if (ngi->h[k] < min) { - min = ngi->h[k]; - min_pos = k; /* encoded min ngram */ - } - } - ncnt++; - if (min <= ngi->min) { - unsigned int rr = ngi->pos[min_pos]; - int hcnt = ngi->h[min_pos]; - ncnt1++; - for(int k = 0; k<hcnt; k++, rr++) { - unsigned int hr = ngi->rid[rr]; - if (((ngi->sigs[hr] & sig) == sig)) { - char *hs = BUNtail(hi, hr); - ncnt3++; - if (strstr(hs, os) != NULL) { - *ol++ = hr; - *or++ = (oid)i; - } - } - } - } 
else { - unsigned int hcnt = BATcount(h); - ncnt2++; - for(size_t k = 0; k < hcnt; k++) { - if (((ngi->sigs[k] & sig) == sig)) { - char *hs = BUNtail(hi, k); - ncnt4++; - if (strstr(hs, os) != NULL) { - *ol++ = k; - *or++ = (oid)i; - } - } - } - } - if (min > nmax) - nmax = min; - } else if (!strNil(s)) { /* skipped */ - unsigned int hcnt = BATcount(h); - ncnt++; - for(size_t k = 0; k < hcnt; k++) { - char *hs = BUNtail(hi, k); - ncnt5++; - if (strstr(hs, os) != NULL) { - *ol++ = k; - *or++ = (oid)i; - } - } - } - } - bat_iterator_end(&ni); - bat_iterator_end(&hi); - BBPreclaim(h); - BBPreclaim(n); - BATsetcount(l, ol - (oid*)Tloc(l, 0)); - BATsetcount(r, ol - (oid*)Tloc(l, 0)); - *L = l->batCacheid; - *R = r->batCacheid; - BBPkeepref(l); - BBPkeepref(r); - ngrams_destroy(ngi); - return MAL_SUCCEED; - } - BBPreclaim(h); - BBPreclaim(n); - throw(MAL, "gram.c1", SQLSTATE(HY013) MAL_MALLOC_FAIL); -} - -static str -NGc1join1(bat *L, bat *sigs, bat *needle, bat *lc, bit *nil_matches, lng *estimate, bit *anti) -{ - return NGc1join_intern(L, NULL, sigs, needle, lc, NULL, nil_matches, estimate, anti); -} - -static str -NGc1join(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti) -{ - return NGc1join_intern(L, R, sigs, needle, lc, rc, nil_matches, estimate, anti); -} - -static int -ngrams_init_2gram(Ngrams *n, BAT *b) -{ - BUN cnt = BATcount(b); _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org