Changeset: 877d0c73a0f4 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/877d0c73a0f4
Added Files:
        monetdb5/modules/mal/ngrams_old.c
Modified Files:
        monetdb5/modules/mal/CMakeLists.txt
        monetdb5/modules/mal/ngrams.c
        monetdb5/modules/mal/ngrams.h
        sql/scripts/49_strings.sql
Branch: strimps_v3
Log Message:

Works with sequential pipe for now


diffs (truncated from 2186 to 300 lines):

diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt
--- a/monetdb5/modules/mal/CMakeLists.txt
+++ b/monetdb5/modules/mal/CMakeLists.txt
@@ -41,7 +41,8 @@ target_sources(malmodules
   projectionpath.c
   tablet.c tablet.h
   batcalc.c calc.c
-  ngrams.c ngrams.h)
+  ngrams.c ngrams.h
+  ngrams_old.c)
 
 target_include_directories(malmodules
   PRIVATE
diff --git a/monetdb5/modules/mal/ngrams.c b/monetdb5/modules/mal/ngrams.c
--- a/monetdb5/modules/mal/ngrams.c
+++ b/monetdb5/modules/mal/ngrams.c
@@ -26,8 +26,7 @@ is_prefix(const char *s1, const char *s2
 static inline int
 is_suffix(const char *s1, const char *s2, int s2_len)
 {
-       int sl = str_strlen(s1);
-       return sl < s2_len ? -1 : strcmp(s1 + sl - s2_len, s2);
+       return strcmp(s1 + strlen(s1) - s2_len, s2);
 }
 
 static inline int
@@ -53,11 +52,11 @@ static void
 ngrams_destroy(Ngrams *ng)
 {
        if (ng) {
+               GDKfree(ng->idx);
+               GDKfree(ng->sigs);
                GDKfree(ng->h);
-               GDKfree(ng->idx);
                GDKfree(ng->pos);
                GDKfree(ng->rid);
-               GDKfree(ng->sigs);
        }
        GDKfree(ng);
 }
@@ -71,7 +70,7 @@ ngrams_create(size_t b_cnt, size_t ng_sz
                ng->sigs = GDKmalloc(b_cnt * sizeof(NGRAM_TYPE));
                ng->h    = GDKmalloc(ng_sz * sizeof(unsigned));
                ng->pos  = GDKmalloc(ng_sz * sizeof(unsigned));
-               ng->rid  = GDKzalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
+               ng->rid  = GDKmalloc(NGRAM_MULTIPLE * b_cnt * sizeof(unsigned));
        }
        if (!ng || !ng->idx || !ng->sigs || !ng->h || !ng->pos || !ng->rid) {
                ngrams_destroy(ng);
@@ -80,684 +79,6 @@ ngrams_create(size_t b_cnt, size_t ng_sz
        return ng;
 }
 
-static Ngrams *
-ngrams_create_old(BAT *b, size_t ngramsize)
-{
-       Ngrams *n = NULL;
-       size_t sz = BATcount(b);
-
-       n = (Ngrams*)GDKmalloc(sizeof(Ngrams));
-       if (n) {
-               n->h = (unsigned int*)GDKmalloc(ngramsize*sizeof(int));
-               n->pos = (unsigned int*)GDKzalloc(ngramsize*sizeof(int));
-               n->rid = (unsigned int*)GDKmalloc(NGRAM_MULTIPLE* sz * sizeof(int));
-               n->idx = (NGRAM_TYPE*)GDKmalloc(ngramsize*sizeof(NGRAM_TYPE));
-               n->sigs = (NGRAM_TYPE*)GDKmalloc(sz * sizeof(NGRAM_TYPE));
-       }
-       if (!n || !n->h || !n->idx || !n->pos || !n->rid || !n->sigs) {
-               ngrams_destroy(n);
-               return NULL;
-       }
-       return n;
-}
-
-static int
-ngrams_init_1gram(Ngrams *n, BAT *b)
-{
-       BUN cnt = BATcount(b);
-       NGRAM_TYPE *h = (NGRAM_TYPE *)GDKzalloc(UNIGRAM_SZ*sizeof(NGRAM_TYPE)), *hist = (NGRAM_TYPE*)h, sum = 0;
-       int *id = (int*)GDKmalloc(UNIGRAM_SZ*sizeof(int)), i;
-       NGRAM_TYPE *idx = n->idx;
-
-       if (!h || !id) {
-               GDKfree(h);
-               GDKfree(id);
-               return -1;
-       }
-
-       BATiter bi = bat_iterator(b);
-       for(BUN i=0; i<cnt; i++) {
-               const char *s = BUNtail(bi,i);
-               if (!strNil(s) && *s) { /* skipped */
-                       for(; *s; s++) {
-                               h[CHAR_MAP(*s)]++;
-                       }
-               }
-       }
-       bat_iterator_end(&bi);
-
-       int bc = 0;
-
-       for(int i=0; i<UNIGRAM_SZ; i++) {
-               id[i] = i;
-               idx[i] = 0;
-               n->h[i] = (unsigned int)hist[i];
-       }
-       GDKqsort(h, id, NULL, UNIGRAM_SZ, sizeof(NGRAM_TYPE), sizeof(int), NGRAM_TYPEID, true, false);
-       for(i=UNIGRAM_SZ-1; i>=0; i--) {
-               if ((size_t)(sum + hist[i]) >= (NGRAM_MULTIPLE*cnt)-1)
-                       break;
-               sum += hist[i];
-       }
-       NGRAM_TYPE larger_cnt = hist[i];
-       for(; hist[i] == larger_cnt; i++)
-               ;
-       NGRAM_TYPE max = hist[0], small = hist[i];
-       n->max = max;
-       n->min = small;
-
-       for(int i=0; i<UNIGRAM_SZ && hist[i] > 0; i++) {
-               unsigned int x=id[i];
-               idx[x] = NGRAM_CST(1)<<bc;
-               assert(idx[x] > 0);
-               bc++;
-               bc %= NGRAM_BITS;
-       }
-
-       bi = bat_iterator(b);
-       NGRAM_TYPE *sp = n->sigs;
-       unsigned int pos = 1;
-       for(BUN i=0; i<cnt; i++) {
-               const char *s = BUNtail(bi, i);
-               NGRAM_TYPE sig = 0;
-               if (!strNil(s) && s[0]) { /* too short skipped */
-                       for(; *s; s++) {
-                               int k = CHAR_MAP(*s);
-                               sig |= idx[k];
-                               if (n->h[k] <= n->min) {
-                                       if (n->pos[k] == 0) {
-                                               n->pos[k] = pos;
-                                               pos += n->h[k];
-                                               n->h[k] = 0;
-                                       }
-                                       /* deduplicate */
-                                       int done =  (n->h[k] > 0 && n->rid[n->pos[k] + n->h[k]-1] == i);
-                                       if (!done) {
-                                               n->rid[n->pos[k] + n->h[k]] = i;
-                                               n->h[k]++;
-                                       }
-                               }
-                       }
-                       *sp = sig;
-               } else {
-                       *sp = NGRAM_TYPENIL;
-               }
-               sp++;
-       }
-       bat_iterator_end(&bi);
-
-       GDKfree(h);
-       GDKfree(id);
-       return 0;
-}
-
-static str
-NGc1join_intern(bat *L, bat *R, bat *H, bat *N, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti)
-{
-       (void)nil_matches;
-       (void)estimate;
-       BAT *h = BATdescriptor(*H);
-       BAT *n = BATdescriptor(*N);
-
-       if (lc && !is_bat_nil(*lc))
-               assert(0);
-       if (rc && !is_bat_nil(*rc))
-               assert(0);
-
-       if (*anti)
-               throw(MAL, "gram.c1", "No anti contains yet\n");
-       if (!h || !n) {
-               BBPreclaim(h);
-               BBPreclaim(n);
-               throw(MAL, "gram.c1", RUNTIME_OBJECT_MISSING);
-       }
-
-       if (BATcount(n) < 10) {
-               printf("todo fall back to select \n");
-       }
-
-       Ngrams *ngi = ngrams_create_old(h, UNIGRAM_SZ);
-       if (ngi && ngrams_init_1gram(ngi, h) == 0) { /* TODO add locks and only create ngram once for full (parent bat) */
-               BUN cnt = BATcount(h);
-               /* create L/R */
-               BAT *l = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT);
-               BAT *r = COLnew(0, TYPE_oid, 10*cnt, TRANSIENT);
-
-               int ncnt = 0, ncnt1 = 0, ncnt2 = 0, ncnt3 = 0, ncnt4 = 0, ncnt5 = 0;
-               BATiter ni = bat_iterator(n);
-               BATiter hi = bat_iterator(h);
-               NGRAM_TYPE nmax = 0;
-               oid *ol = Tloc(l, 0), *el = ol + 10*cnt;
-               oid *or = Tloc(r, 0);
-               cnt = BATcount(n);
-               /* if needed grow */
-               for(BUN i = 0; i<cnt; i++) {
-                       const char *s = BUNtail(ni,i), *os = s;
-                       NGRAM_TYPE sig = 0;
-
-                       if ((ol+1000) > el)
-                               break;
-                       if (!strNil(s) && s[0]) {
-                               NGRAM_TYPE min = ngi->max;
-                               unsigned int min_pos = 0;
-                               for(; *s; s++) {
-                                       unsigned int k = CHAR_MAP(*s);
-                                       sig |= ngi->idx[k];
-                                       if (ngi->h[k] < min) {
-                                               min = ngi->h[k];
-                                               min_pos = k; /* encoded min ngram */
-                                       }
-                               }
-                               ncnt++;
-                               if (min <= ngi->min) {
-                                       unsigned int rr = ngi->pos[min_pos];
-                                       int hcnt = ngi->h[min_pos];
-                                       ncnt1++;
-                                       for(int k = 0; k<hcnt; k++, rr++) {
-                                               unsigned int hr = ngi->rid[rr];
-                                               if (((ngi->sigs[hr] & sig) == sig)) {
-                                                       char *hs = BUNtail(hi, hr);
-                                                       ncnt3++;
-                                                       if (strstr(hs, os) != NULL) {
-                                                               *ol++ = hr;
-                                                               *or++ = (oid)i;
-                                                       }
-                                               }
-                                       }
-                               } else {
-                                       unsigned int hcnt = BATcount(h);
-                                       ncnt2++;
-                                       for(size_t k = 0; k < hcnt; k++) {
-                                               if (((ngi->sigs[k] & sig) == sig)) {
-                                                       char *hs = BUNtail(hi, k);
-                                                       ncnt4++;
-                                                       if (strstr(hs, os) != NULL) {
-                                                               *ol++ = k;
-                                                               *or++ = (oid)i;
-                                                       }
-                                               }
-                                       }
-                               }
-                               if (min > nmax)
-                                       nmax = min;
-                       } else if (!strNil(s)) { /* skipped */
-                               unsigned int hcnt = BATcount(h);
-                               ncnt++;
-                               for(size_t k = 0; k < hcnt; k++) {
-                                       char *hs = BUNtail(hi, k);
-                                       ncnt5++;
-                                       if (strstr(hs, os) != NULL) {
-                                               *ol++ = k;
-                                               *or++ = (oid)i;
-                                       }
-                               }
-                       }
-               }
-               bat_iterator_end(&ni);
-               bat_iterator_end(&hi);
-               BBPreclaim(h);
-               BBPreclaim(n);
-               BATsetcount(l, ol - (oid*)Tloc(l, 0));
-               BATsetcount(r, ol - (oid*)Tloc(l, 0));
-               *L = l->batCacheid;
-               *R = r->batCacheid;
-               BBPkeepref(l);
-               BBPkeepref(r);
-               ngrams_destroy(ngi);
-               return MAL_SUCCEED;
-       }
-       BBPreclaim(h);
-       BBPreclaim(n);
-       throw(MAL, "gram.c1", SQLSTATE(HY013) MAL_MALLOC_FAIL);
-}
-
-static str
-NGc1join1(bat *L, bat *sigs, bat *needle, bat *lc, bit *nil_matches, lng *estimate, bit *anti)
-{
-       return NGc1join_intern(L, NULL, sigs, needle, lc, NULL, nil_matches, estimate, anti);
-}
-
-static str
-NGc1join(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti)
-{
-       return NGc1join_intern(L, R, sigs, needle, lc, rc, nil_matches, estimate, anti);
-}
-
-static int
-ngrams_init_2gram(Ngrams *n, BAT *b)
-{
-       BUN cnt = BATcount(b);
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to