Changeset: 73c58147a715 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/73c58147a715
Modified Files:
        gdk/gdk.h
        gdk/gdk_strimps.c
        monetdb5/modules/atoms/str.c
        monetdb5/modules/atoms/utf8.h
        monetdb5/modules/mal/pcre.c
Branch: strimps2
Log Message:

add ^first char end last char $ for better startsWith/endsWith support
splified utf8casencmp
swap join arguments in startswith/endswith/contains joins, ie do
crete strimps on the largest of the 2 inputs

fixes (inprogress) of the strimps implementation.


diffs (truncated from 739 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -1898,7 +1898,7 @@ gdk_export lng IMPSimprintsize(BAT *b);
 
 /* Strimps exported functions */
 gdk_export gdk_return STRMPcreate(BAT *b, BAT *s);
-gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const char *q, const bool 
keep_nils);
+gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const char *q, const bool 
keep_nils, bte sce);
 gdk_export void STRMPdestroy(BAT *b);
 gdk_export bool BAThasstrimps(BAT *b);
 gdk_export gdk_return BATsetstrimps(BAT *b);
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -91,8 +91,15 @@
 
 #define STRIMP_VERSION (uint64_t)2
 #define STRIMP_HISTSIZE (256*256)
+
+#define BIT_SHARING 1
 #define STRIMP_HEADER_SIZE 64
+#if BIT_SHARING
+#define STRIMP_PAIRS 2048
+#else
 #define STRIMP_PAIRS (STRIMP_HEADER_SIZE - 1)
+#endif
+
 #define STRIMP_CREATION_THRESHOLD                              \
        ((BUN) ((ATOMIC_GET(&GDKdebug) & FORCEMITOMASK)? 100 : 5000))
 
@@ -108,8 +115,8 @@ typedef struct {
 } CharPair;
 
 typedef struct {
-       size_t pos;
-       size_t lim;
+       ssize_t pos;
+       ssize_t lim;
        const char *s;
 } PairIterator;
 
@@ -150,7 +157,6 @@ pair_equal(const CharPair *p1, const Cha
  */
 
 /* We disregard spaces, digits and punctuation characters */
-#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
 #define pairToIndex(b1, b2) (size_t)(((uint16_t)b2)<<8 | ((uint16_t)b1))
 
 inline static size_t
@@ -161,13 +167,21 @@ bytes2histindex(uint8_t *bytes, uint8_t 
 }
 
 inline static bool
-pair_at(const PairIterator *pi, CharPair *p)
+pair_at(const PairIterator *pi, CharPair *p, bte sce)
 {
-       if (pi->pos >= pi->lim - 1)
+       if (pi->pos >= (pi->lim - ((sce&2)?1:0)))
                return false;
 
-       p->pbytes[0] = (uint8_t)tolower((unsigned char) pi->s[pi->pos]);
-       p->pbytes[1] = (uint8_t)tolower((unsigned char) pi->s[pi->pos + 1]);
+       if (sce&1 && pi->pos < 0) {
+               p->pbytes[0] = '^';
+               p->pbytes[1] = (uint8_t)tolower((unsigned char) pi->s[0]);
+       } else if (sce&2 && pi->pos == (pi->lim - 1)) {
+               p->pbytes[0] = (uint8_t)tolower((unsigned char) pi->s[pi->pos]);
+               p->pbytes[1] = '$';
+       } else {
+               p->pbytes[0] = (uint8_t)tolower((unsigned char) pi->s[pi->pos]);
+               p->pbytes[1] = (uint8_t)tolower((unsigned char) pi->s[pi->pos + 
1]);
+       }
 
        p->psize = 2;
        p->idx = pairToIndex(p->pbytes[0], p->pbytes[1]);
@@ -175,23 +189,14 @@ pair_at(const PairIterator *pi, CharPair
 }
 
 inline static bool
-next_pair(PairIterator *pi)
+next_pair(PairIterator *pi, bte sce)
 {
-       if (pi->pos >= pi->lim - 1)
+       if (pi->pos >= (pi->lim - ((sce&2)?1:0)))
                return false;
        pi->pos++;
        return true;
 }
 
-/* Returns true if the specified char is ignored.
- */
-inline static bool
-ignored(const CharPair *p, uint8_t elm)
-{
-       assert(elm == 0 || elm == 1);
-       return isIgnored(p->pbytes[elm]);
-}
-
 inline static strimp_masks_t
 STRMPget_mask(const Strimps *r, uint64_t idx)
 {
@@ -223,7 +228,7 @@ STRMPpairLookup(const Strimps *s, const 
  *
  */
 static uint64_t
-STRMPmakebitstring(const char *s, Strimps *r)
+STRMPmakebitstring(const char *s, Strimps *r, bte sce)
 {
        uint64_t ret = 0;
        /* int8_t pair_idx = 0; */
@@ -231,16 +236,16 @@ STRMPmakebitstring(const char *s, Strimp
        CharPair cp;
 
        pi.s = s;
-       pi.pos = 0;
+       pi.pos = (sce&1)?-1:0;
        pi.lim = strlen(s);
 
-       if (pi.lim < 2) {
+       if ((pi.lim + (sce?1:0)) < 2) {
                return ret;
        }
 
-       while(pair_at(&pi, &cp)) {
+       while(pair_at(&pi, &cp, sce)) {
                ret |= STRMPpairLookup(r, &cp);
-               next_pair(&pi);
+               next_pair(&pi, sce);
        }
 
        return ret;
@@ -255,6 +260,30 @@ STRMPmakebitstring(const char *s, Strimp
 
 
 
+#if BIT_SHARING
+static void
+STRMPchoosePairs(const PairHistogramElem *hist, size_t hist_size, CharPair *cp)
+{
+       lng t0 = 0;
+       size_t i;
+
+       TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
+
+       int j = 0, k = 0;
+       for(i = 0; k < STRIMP_PAIRS && k < (int)hist_size && i < hist_size; 
i++) {
+               if (hist[i].cnt == 0)
+                       continue;
+               cp[k].idx = i;
+               cp[k].psize = 2;
+               cp[k].mask = ((uint64_t)0x1) << (STRIMP_PAIRS - j - 1);
+               j++;
+               k++;
+               if (j==63)
+                       j=0;
+       }
+       TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0);
+}
+#else
 static void
 STRMPchoosePairs(const PairHistogramElem *hist, size_t hist_size, CharPair *cp)
 {
@@ -277,6 +306,7 @@ STRMPchoosePairs(const PairHistogramElem
                        }
                }
        }
+       cp[STRIMP_PAIRS] = (CharPair) {.psize = 2};
 
        for(i = 0; i < STRIMP_PAIRS; i++) {
                cp[i].pbytes[1] = (uint8_t)(indices[i] & 0xFF);
@@ -284,11 +314,11 @@ STRMPchoosePairs(const PairHistogramElem
                cp[i].idx = indices[i];
                cp[i].psize = 2;
                cp[i].mask = ((uint64_t)0x1) << (STRIMP_PAIRS - i - 1);
+               printf("cp i=%d '%c' '%c' %d\n", (int)i, cp[i].pbytes[0], 
cp[i].pbytes[1], (int)hist[cp[i].idx].cnt);
        }
-       cp[STRIMP_PAIRS] = (CharPair) {.psize = 2};
-
        TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0);
 }
+#endif
 
 /* Given a BAT b and a candidate list s constructs the header elements
  * of the strimp.
@@ -305,7 +335,7 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
        size_t hidx;
        oid x;
        size_t hlen;
-       PairHistogramElem *hist;
+       PairHistogramElem *hist = NULL;
        PairIterator pi;
        CharPair cp;
        struct canditer ci;
@@ -330,46 +360,29 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
        for (i = 0; i < ci.ncand; i++) {
                x = canditer_next(&ci) - b->hseqbase;
                const char *cs = BUNtvar(bi, x);
+               const bte sce = 3;
                if (!strNil(cs)) {
                        pi.s = cs;
                        pi.pos = 0;
                        pi.lim = strlen(pi.s);
-                       if (pi.lim < 2) {
+                       if (pi.lim < 1) {
                                continue;
                        }
-                       while (pair_at(&pi, &cp)) {
-                               if(ignored(&cp, 1)) {
-                                       /* Skip this AND the next pair
-                                        * if the second char of the
-                                        * pair is ignored.
-                                        */
-                                       next_pair(&pi);
-                               } else if (ignored(&cp, 0)) {
-                                       /* Skip this pair if the first
-                                        * char is ignored. This should
-                                        * only happen at the beginnig
-                                        * of a string, since the pair
-                                        * will have been ignored in the
-                                        * previous case.
-                                        */
-                                       ;
-
-                               } else {
-                                       /* hidx = histogram_index(hist, hlen, 
&cp); */
-                                       hidx = cp.idx;
+                       while (pair_at(&pi, &cp, sce)) {
+                               /* hidx = histogram_index(hist, hlen, &cp); */
+                               hidx = cp.idx;
 #ifndef UTF8STRINGS
-                                       assert(hidx < hlen);
+                               assert(hidx < hlen);
 #else
-                                       if (hidx >= hlen) {
-                                               // TODO: Note and realloc. 
Should not happen for bytepairs.
-                                               continue;
-                                       }
+                               if (hidx >= hlen) {
+                                       // TODO: Note and realloc. Should not 
happen for bytepairs.
+                                       continue;
+                               }
 #endif
-                                       if (!hist[hidx].cnt)
-                                               values++;
-                                       hist[hidx].cnt++;
-                               }
-                               next_pair(&pi);
+                               if (!hist[hidx].cnt)
+                                       values++;
+                               hist[hidx].cnt++;
+                               next_pair(&pi, sce);
                        }
                }
        }
@@ -384,6 +397,7 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
        GDKfree(hist);
 
        TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0);
+       printf("values %d\n", (int)values);
        if (!(res = values >= STRIMP_HEADER_SIZE))
                GDKerror("Not enough distinct values to create strimp index\n");
        return res;
@@ -520,8 +534,18 @@ BATcheckstrimps(BAT *b)
        do {                                                            \
                for (i = 0; i < ci.ncand; i++) {                        \
                        x = next(&ci);                                  \
+                       if ((bitstring_array[x] & qbmask) == qbmask) { \
+                               rvals[j++] = x;                         \
+                       }                                               \
+               }                                                       \
+       } while (0)
+
+#define STRMPfilterloop_keepnils(next)                                 \
+       do {                                                            \
+               for (i = 0; i < ci.ncand; i++) {                        \
+                       x = next(&ci);                                  \
                        if ((bitstring_array[x] & qbmask) == qbmask || \
-                           (keep_nils && (bitstring_array[x] & ((uint64_t)0x1 
<< (STRIMP_HEADER_SIZE - 1))))) { \
+                           (bitstring_array[x] & ((uint64_t)0x1 << 
(STRIMP_HEADER_SIZE - 1)))) { \
                                rvals[j++] = x;                         \
                        }                                               \
                }                                                       \
@@ -537,7 +561,7 @@ BATcheckstrimps(BAT *b)
  * final result.
  */
 BAT *
-STRMPfilter(BAT *b, BAT *s, const char *q, const bool keep_nils)
+STRMPfilter(BAT *b, BAT *s, const char *q, const bool keep_nils, bte sce /* 
include ^first bit 1, last$ bit 2 */)
 {
        BAT *r = NULL;
        BUN i, j = 0;
@@ -583,17 +607,33 @@ STRMPfilter(BAT *b, BAT *s, const char *
                goto sfilter_fail;
        }
 
-       qbmask = STRMPmakebitstring(q, strmps);
+       qbmask = STRMPmakebitstring(q, strmps, sce);
+
        assert((qbmask & ((uint64_t)0x1 << (STRIMP_HEADER_SIZE - 1))) == 0);
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to