Changeset: cd3383e679c5 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/cd3383e679c5 Added Files: monetdb5/modules/mal/ngrams.h Modified Files: monetdb5/modules/mal/CMakeLists.txt monetdb5/modules/mal/ngrams.c sql/scripts/49_strings.sql Branch: strimps_v3 Log Message:
Something is wrong diffs (truncated from 1342 to 300 lines): diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt --- a/monetdb5/modules/mal/CMakeLists.txt +++ b/monetdb5/modules/mal/CMakeLists.txt @@ -41,7 +41,7 @@ target_sources(malmodules projectionpath.c tablet.c tablet.h batcalc.c calc.c - ngrams.c) + ngrams.c ngrams.h) target_include_directories(malmodules PRIVATE diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c --- a/monetdb5/modules/mal/ngrams.c +++ b/monetdb5/modules/mal/ngrams.c @@ -11,49 +11,12 @@ */ #include "monetdb_config.h" +#include "ngrams.h" #include "mal_interpreter.h" #include "mal_exception.h" #include "string.h" #include "str.h" -#if HAVE_HGE -#define GZ 128 -#define CHAR_MAP(s) (s&127) -#else -#define GZ 64 -#define CHAR_MAP(s) (s&63) -#endif - -#define UNIGRAM_SZ GZ -#define BIGRAM_SZ (GZ*GZ) -#define TRIGRAM_SZ (GZ*GZ*GZ) - -#if HAVE_HGE -#define NGRAM_TYPE hge -#define NGRAM_TYPEID TYPE_hge -#define NGRAM_TYPENIL hge_nil -#define NGRAM_CST(v) ((hge)LL_CONSTANT(v)) -#define NGRAM_BITS 127 -#else -#define NGRAM_TYPE lng -#define NGRAM_TYPEID TYPE_lng -#define NGRAM_TYPENIL lng_nil -#define NGRAM_CST(v) LL_CONSTANT(v) -#define NGRAM_BITS 63 -#endif - -#define NGRAM_MULTIPLE 16 - -#define SET_EMPTY_BAT_PROPS(B) \ - do { \ - B->tnil = false; \ - B->tnonil = true; \ - B->tkey = true; \ - B->tsorted = true; \ - B->trevsorted = true; \ - B->tseqbase = 0; \ - } while (0) - static inline void BBPreclaim_n(int nargs, ...) { @@ -66,15 +29,6 @@ BBPreclaim_n(int nargs, ...) va_end(valist); } -typedef struct { - NGRAM_TYPE *idx; - NGRAM_TYPE *sigs; - NGRAM_TYPE max, min; - unsigned int *h; - unsigned int *pos; - unsigned int *rid; -} Ngrams; - static void ngrams_destroy(Ngrams *ng) { @@ -89,23 +43,702 @@ ngrams_destroy(Ngrams *ng) } static Ngrams * -ngrams_create(size_t b_sz, size_t ng_sz) +ngrams_create(size_t b_cnt, size_t ng_sz) { Ngrams *ng = GDKmalloc(sizeof(Ngrams)); if (ng) { - ng->idx = GDKmalloc(ng_sz * sizeof(NGRAM_TYPE)); - ng->sigs = GDKmalloc(b_sz * sizeof(NGRAM_TYPE)); - ng->h = GDKmalloc(ng_sz * sizeof(unsigned int)); - ng->pos = GDKzalloc(ng_sz * sizeof(unsigned int)); - ng->rid = GDKmalloc(NGRAM_MULTIPLE * b_sz * sizeof(unsigned int)); + ng->idx = GDKzalloc(ng_sz * sizeof(NGRAM_TYPE)); + ng->sigs = GDKzalloc(b_cnt * sizeof(NGRAM_TYPE)); + ng->h = GDKzalloc(ng_sz * sizeof(unsigned)); + ng->pos = GDKzalloc(ng_sz * sizeof(unsigned)); + ng->rid = GDKzalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned)); } - if (!ng || !ng->h || !ng->idx || !ng->pos || !ng->rid || !ng->sigs) { + if (!ng || !ng->idx || !ng->sigs || !ng->h || !ng->pos || !ng->rid) { ngrams_destroy(ng); return NULL; } return ng; } +static Ngrams * +ngrams_create_old(BAT *b, size_t ngramsize) +{ + Ngrams *n = NULL; + size_t sz = BATcount(b); + + n = (Ngrams*)GDKmalloc(sizeof(Ngrams)); + if (n) { + n->h = (unsigned int*)GDKmalloc(ngramsize*sizeof(int)); + n->pos = (unsigned int*)GDKzalloc(ngramsize*sizeof(int)); + n->rid = (unsigned int*)GDKmalloc(NGRAM_MULTIPLE* sz * sizeof(int)); + n->idx = (NGRAM_TYPE*)GDKmalloc(ngramsize*sizeof(NGRAM_TYPE)); + n->sigs = (NGRAM_TYPE*)GDKmalloc(sz * sizeof(NGRAM_TYPE)); + } + if (!n || !n->h || !n->idx || !n->pos || !n->rid || !n->sigs) { + ngrams_destroy(n); + return NULL; + } + return n; +} + +static int +ngrams_init_1gram(Ngrams *n, BAT *b) +{ + BUN cnt = BATcount(b); + NGRAM_TYPE *h = (NGRAM_TYPE *)GDKzalloc(UNIGRAM_SZ*sizeof(NGRAM_TYPE)), *hist = (NGRAM_TYPE*)h, sum = 0; + int *id = (int*)GDKmalloc(UNIGRAM_SZ*sizeof(int)), i; + NGRAM_TYPE *idx = n->idx; + + if (!h || !id) { + GDKfree(h); + GDKfree(id); + return -1; + } + + BATiter bi = bat_iterator(b); + for(BUN i=0; i<cnt; i++) { + const char *s = BUNtail(bi,i); + if (!strNil(s) && *s) { /* skipped */ + for(; *s; s++) { + h[CHAR_MAP(*s)]++; + } + } + } + bat_iterator_end(&bi); + + int bc = 0; + + for(int i=0; i<UNIGRAM_SZ; i++) { + id[i] = i; + idx[i] = 0; + n->h[i] = (unsigned int)hist[i]; + } + GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), NGRAM_TYPEID, true, false); + for(i=UNIGRAM_SZ-1; i>=0; i--) { + if ((size_t)(sum + hist[i]) >= (NGRAM_MULTIPLE*cnt)-1) + break; + sum += hist[i]; + } + NGRAM_TYPE larger_cnt = hist[i]; + for(; hist[i] == larger_cnt; i++) + ; + NGRAM_TYPE max = hist[0], small = hist[i]; + n->max = max; + n->min = small; + + for(int i=0; i<UNIGRAM_SZ && hist[i] > 0; i++) { + unsigned int x=id[i]; + idx[x] = NGRAM_CST(1)<<bc; + assert(idx[x] > 0); + bc++; + bc %= NGRAM_BITS; + } + + bi = bat_iterator(b); + NGRAM_TYPE *sp = n->sigs; + unsigned int pos = 1; + for(BUN i=0; i<cnt; i++) { + const char *s = BUNtail(bi, i); + NGRAM_TYPE sig = 0; + if (!strNil(s) && s[0]) { /* too short skipped */ + for(; *s; s++) { + int k = CHAR_MAP(*s); + sig |= idx[k]; + if (n->h[k] <= n->min) { + if (n->pos[k] == 0) { + n->pos[k] = pos; + pos += n->h[k]; + n->h[k] = 0; + } + /* deduplicate */ + int done = (n->h[k] > 0 && n->rid[n->pos[k] + n->h[k]-1] == i); + if (!done) { + n->rid[n->pos[k] + n->h[k]] = i; + n->h[k]++; + } + } + } + *sp = sig; + } else { + *sp = NGRAM_TYPENIL; + } + sp++; + } + bat_iterator_end(&bi); + + GDKfree(h); + GDKfree(id); + return 0; +} + +static str +NGc1join_intern(bat *L, bat *R, bat *H, bat *N, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti) +{ + (void)nil_matches; + (void)estimate; + BAT *h = BATdescriptor(*H); + BAT *n = BATdescriptor(*N); + + if (lc && !is_bat_nil(*lc)) + assert(0); + if (rc && !is_bat_nil(*rc)) + assert(0); + + if (*anti) + throw(MAL, "gram.c1", "No anti contains yet\n"); + if (!h || !n) { + BBPreclaim(h); + BBPreclaim(n); + throw(MAL, "gram.c1", RUNTIME_OBJECT_MISSING); + } + + if (BATcount(n) < 10) { + printf("todo fall back to select \n"); + } + + Ngrams *ngi = ngrams_create_old(h, UNIGRAM_SZ); + if (ngi && ngrams_init_1gram(ngi, h) == 0) { /* TODO add locks and only create ngram once for full (parent bat) */ + BUN cnt = BATcount(h); + /* create L/R */ + BAT *l = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT); + BAT *r = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT); + + int ncnt = 0, ncnt1 = 0, ncnt2 = 0, ncnt3 = 0, ncnt4 = 0, ncnt5 = 0; + BATiter ni = bat_iterator(n); + BATiter hi = bat_iterator(h); + NGRAM_TYPE nmax = 0; + oid *ol = Tloc(l, 0), *el = ol + 10*cnt; + oid *or = Tloc(r, 0); + cnt = BATcount(n); + /* if needed grow */ + for(BUN i = 0; i<cnt; i++) { + const char *s = BUNtail(ni,i), *os = s; + NGRAM_TYPE sig = 0; + + if ((ol+1000) > el) + break; + if (!strNil(s) && s[0]) { + NGRAM_TYPE min = ngi->max; + unsigned int min_pos = 0; + for(; *s; s++) { + unsigned int k = CHAR_MAP(*s); + sig |= ngi->idx[k]; + if (ngi->h[k] < min) { + min = ngi->h[k]; + min_pos = k; /* encoded min ngram */ + } + } + ncnt++; + if (min <= ngi->min) { + unsigned int rr = ngi->pos[min_pos]; + int hcnt = ngi->h[min_pos]; + ncnt1++; + for(int k = 0; k<hcnt; k++, rr++) { + unsigned int hr = ngi->rid[rr]; + if (((ngi->sigs[hr] & sig) == sig)) { + char *hs = BUNtail(hi, hr); + ncnt3++; + if (strstr(hs, os) != NULL) { + *ol++ = hr; + *or++ = (oid)i; + } + } + } + } else { + unsigned int hcnt = BATcount(h); + ncnt2++; + for(size_t k = 0; k < hcnt; k++) { + if (((ngi->sigs[k] & sig) == sig)) { + char *hs = BUNtail(hi, k); _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org