Changeset: d1f23189724f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/d1f23189724f
Branch: default
Log Message:

Merge strimps_update into default


diffs (truncated from 1283 to 300 lines):

diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -190,6 +190,7 @@ gdk_return BATsemijoin(BAT **r1p, BAT **
 BAT *BATsetaccess(BAT *b, restrict_t mode) 
__attribute__((__warn_unused_result__));
 void BATsetcapacity(BAT *b, BUN cnt);
 void BATsetcount(BAT *b, BUN cnt);
+gdk_return BATsetstrimps(BAT *b);
 BAT *BATslice(BAT *b, BUN low, BUN high);
 gdk_return BATsort(BAT **sorted, BAT **order, BAT **groups, BAT *b, BAT *o, 
BAT *g, bool reverse, bool nilslast, bool stable) 
__attribute__((__warn_unused_result__));
 gdk_return BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool 
skip_nils, bool nil_if_empty, const char *restrict separator);
@@ -401,7 +402,7 @@ BUN SORTfndfirst(BAT *b, const void *v);
 BUN SORTfndlast(BAT *b, const void *v);
 gdk_return STRMPcreate(BAT *b, BAT *s);
 void STRMPdestroy(BAT *b);
-BAT *STRMPfilter(BAT *b, BAT *s, const char *q);
+BAT *STRMPfilter(BAT *b, BAT *s, const char *q, const bool keep_nils);
 MT_Id THRcreate(void (*f)(void *), void *arg, enum MT_thr_detach d, const char 
*name);
 void *THRdata[THREADDATA];
 void THRdel(Thread t);
diff --git a/gdk/ChangeLog b/gdk/ChangeLog
--- a/gdk/ChangeLog
+++ b/gdk/ChangeLog
@@ -1,6 +1,19 @@
 # ChangeLog file for GDK
 # This file is updated with Maddlog
 
+* Thu Jun  2 2022 Panagiotis Koutsourakis <kutsu...@monetdbsolutions.com>
+- The interface for using strimps has not changed (create an imprint index
+  on a column of a read only table), but now construction happens at the
+  beginning of the first query that uses the strimp and is performed in
+  a multithreaded manner.
+
+* Thu May 12 2022 Panagiotis Koutsourakis <kutsu...@monetdbsolutions.com>
+- Implemented the use of strimps for NOT LIKE queries. The idea is to
+  run the corresponding LIKE query using strimps and take the complement
+  of the result. We keep around NULL values both during strimp filtering
+  and during the pcre part of the LIKE query so that they get discarded
+  automatically when we take the complement.
+
 * Mon Mar 21 2022 Sjoerd Mullender <sjo...@acm.org>
 - The function BBPkeepref now gets a BAT pointer as argument instead of
   a bat id.
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -1845,9 +1845,10 @@ gdk_export lng IMPSimprintsize(BAT *b);
 
 /* Strimps exported functions */
 gdk_export gdk_return STRMPcreate(BAT *b, BAT *s);
-gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const char *q);
+gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const char *q, const bool 
keep_nils);
 gdk_export void STRMPdestroy(BAT *b);
 gdk_export bool BAThasstrimps(BAT *b);
+gdk_export gdk_return BATsetstrimps(BAT *b);
 
 /* The ordered index structure */
 
diff --git a/gdk/gdk_cand.c b/gdk/gdk_cand.c
--- a/gdk/gdk_cand.c
+++ b/gdk/gdk_cand.c
@@ -330,7 +330,7 @@ BATdiffcand(BAT *a, BAT *b)
                        while (!is_oid_nil(ob) && ob < oa) {
                                ob = canditer_next(&cib);
                        }
-                       if (!is_oid_nil(ob) && oa < ob)
+                       if (is_oid_nil(ob) || oa < ob)
                                *p++ = oa;
                }
        }
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -12,10 +12,11 @@
  * A string imprint is an index that can be used as a prefilter in LIKE
  * queries. It has 2 components:
  *
- * - a header of 64 string element pairs.
+ * - a header of 63 string element pairs.
  *
  * - a 64 bit mask for each string in the BAT that encodes the presence
- *   or absence of each element of the header in the specific item.
+ *   or absence of each element of the header in the specific item or if
+ *   the corresponding entry in the BAT is nil.
  *
  * A string imprint is stored in a new Heap in the BAT, aligned in 8
  * byte (64 bit) words.
@@ -24,12 +25,12 @@
  * header of the strimp is encoded. The least significant byte (v in the
  * schematic below) is the version number. The second (np) is the number
  * of pairs in the header. In the current implementation this is always
- * 64. The next 2 bytes (hs) is the total size of the header in
+ * 63. The next 2 bytes (hs) is the total size of the header in
  * bytes. Finally the fifth byte is the persistence byte. The last 3
  * bytes needed to align to the 8 byte boundary should be zero, and are
  * reserved for future use.
  *
- * The following np bytes are the sizes of the pairs. These can have
+ * The following np + 1 bytes are the sizes of the pairs. These can have
  * values from 2 to 8 and are the number of bytes that the corresponding
  * pair takes up. Following that there are the bytes encoding the actual
  * pairs.
@@ -61,19 +62,21 @@
  *
  * - Take the np most frequent pairs as the Strimp Header.
  *
- * - For each string s in the BAT, construct an np-bit mask, m_s that
- *   encodes the presence or absence of each member of the header in the
- *   string.
+ * - For each string s in the BAT, construct an (np + 1)-bit mask, m_s
+ *   that encodes the presence or absence of each member of the header
+ *   in the string or if s is nil.
  *
  * Filtering with a query string q goes as follows:
  *
- * - Use the strimp header to construct an np-bit mask for q encoding
- *   the presence or absence of each member of the header in q.
+ * - Use the strimp header to construct an (np + 1)-bit mask for q
+ *   encoding the presence or absence of each member of the header in q.
  *
- * - For each bitmask in the strimp compute the bitwise AND of m_s and
- *   q. If the result is equal to q, that means that string s contains
- *   the same strimp header elements as q, so it is kept for more
- *   detailed examination.
+ * - For each bitmask in the strimp, first check if it encodes a nil
+ *   value and keep it if it needs to be kept (this happens for NOT LIKE
+ *   queries). Otherwise compute the bitwise AND of m_s and q. If the
+ *   result is equal to q, that means that string s contains the same
+ *   strimp header elements as q, so it is kept for more detailed
+ *   examination.
  */
 
 #include "monetdb_config.h"
@@ -241,11 +244,6 @@ STRMPmakebitstring(const char *s, Strimp
  * STRIMP_HEADER_SIZE largest elements. This depends on the size of the
  * histogram n. For some small n sorting might be more efficient, but
  * for such inputs the difference should not be noticeable.
- *
- * TODO: Explore if a priority queue (heap construction and 64 extract
- * maximums) is worth it. The tradeoff here is that it will complicate
- * the code but might improve performance. In a debug build the current
- * implementation takes ~200 μs.
  */
 static void
 STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp)
@@ -397,6 +395,20 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
        return res;
 }
 
+/* Read a strimp structure from disk.
+ *
+ * If the pointer b->tstrimps has the value 1, it means that the strimp
+ * is on disk. This routine attempts to read it so that it can be used.
+ *
+ * There are a number of checks made for example we check that the
+ * strimps version on disk matches the one the code recognizes, and that
+ * the number of pairs encoded on disk matches the one we expect. If any
+ * of these checks fail, we remove the file from disk since it is now
+ * unusable, and set the pointer b->tstrimps to 2 so that the strimp
+ * will be recreated.
+ *
+ * This function returns true if at the end we have a valid pointer.
+ */
 static bool
 BATcheckstrimps(BAT *b)
 {
@@ -407,82 +419,77 @@ BATcheckstrimps(BAT *b)
                return false;
 
        assert(b->batCacheid > 0);
+
        if (b->tstrimps == (Strimps *)1) {
-               assert(!GDKinmemory(b->theap->farmid));
-               MT_lock_set(&b->batIdxLock);
-               if (b->tstrimps == (Strimps *)1) {
-                       Strimps *hp;
-                       const char *nme = BBP_physical(b->batCacheid);
-                       int fd;
+               Strimps *hp;
+               const char *nme = BBP_physical(b->batCacheid);
+               int fd;
+
+               MT_thread_setalgorithm("read strimp index from disk");
 
-                       b->tstrimps = NULL;
-                       if ((hp = GDKzalloc(sizeof(Strimps))) != NULL &&
-                           (hp->strimps.farmid = BBPselectfarm(b->batRole, 
b->ttype, strimpheap)) >= 0) {
-                               strconcat_len(hp->strimps.filename,
-                                             sizeof(hp->strimps.filename),
-                                             nme, ".tstrimps", NULL);
+               b->tstrimps = NULL;
+               if ((hp = GDKzalloc(sizeof(Strimps))) != NULL &&
+                   (hp->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, 
strimpheap)) >= 0) {
+                       strconcat_len(hp->strimps.filename,
+                                     sizeof(hp->strimps.filename),
+                                     nme, ".tstrimps", NULL);
 
-                               /* check whether a persisted strimp can be 
found */
-                               if ((fd = GDKfdlocate(hp->strimps.farmid, nme, 
"rb+", "tstrimps")) >= 0) {
-                                       struct stat st;
-                                       uint64_t desc;
-                                       size_t npairs;
-                                       size_t hsize;
-                                       /* Read the 8 byte long strimp
-                                        * descriptor.
-                                        *
-                                        * NPAIRS must be 64 in the
-                                        * current implementation.
-                                        *
-                                        * HSIZE must be between 200 and
-                                        * 584 (inclusive): 8 bytes the
-                                        * descritor, 64 bytes the pair
-                                        * sizes and n*64 bytes the
-                                        * actual pairs where 2 <= n <=
-                                        * 8.
-                                        */
-                                       if (read(fd, &desc, 8) == 8
-                                           && (desc & 0xff) == STRIMP_VERSION
-                                           && ((npairs = NPAIRS(desc)) == 64)
-                                           && (hsize = HSIZE(desc)) >= 200 && 
hsize <= 584
-                                           && ((desc >> 32) & 0xff) == 1 /* 
check the persistence byte */
-                                           && fstat(fd, &st) == 0
-                                           /* TODO: We might need padding in 
the UTF-8 case. */
-                                           && st.st_size >= (off_t) 
(hp->strimps.free = hp->strimps.size =
-                                                                     /* header 
size (desc + offsets + pairs) */
-                                                                     hsize +
-                                                                     /* 
bitmasks */
-                                                                     
BATcount(b)*sizeof(uint64_t))
-                                           && HEAPload(&hp->strimps, nme, 
"tstrimps", false) == GDK_SUCCEED) {
-                                               hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
-                                               hp->pairs_base = hp->sizes_base 
+ npairs;         /* pairs just after the offsets */
-                                               hp->bitstrings_base = 
hp->strimps.base + hsize;        /* bitmasks just after the pairs */
+                       /* check whether a persisted strimp can be found */
+                       if ((fd = GDKfdlocate(hp->strimps.farmid, nme, "rb+", 
"tstrimps")) >= 0) {
+                               struct stat st;
+                               uint64_t desc;
+                               size_t npairs;
+                               size_t hsize;
+                               /* Read the 8 byte long strimp
+                                * descriptor.
+                                *
+                                * HSIZE must be between 200 and
+                                * 584 (inclusive): 8 bytes the
+                                * descritor, 64 bytes the pair
+                                * sizes and n*64 bytes the
+                                * actual pairs where 2 <= n <=
+                                * 8.
+                                */
+                               if (read(fd, &desc, 8) == 8
+                                   && (desc & 0xff) == STRIMP_VERSION
+                                   && ((npairs = NPAIRS(desc)) == STRIMP_PAIRS)
+                                   && (hsize = HSIZE(desc)) >= 200 && hsize <= 
584
+                                   && ((desc >> 32) & 0xff) == 1 /* check the 
persistence byte */
+                                   && fstat(fd, &st) == 0
+                                   /* TODO: We might need padding in the UTF-8 
case. */
+                                   && st.st_size >= (off_t) (hp->strimps.free 
= hp->strimps.size =
+                                                             /* header size 
(desc + offsets + pairs) */
+                                                             hsize +
+                                                             /* bitmasks */
+                                                             
BATcount(b)*sizeof(uint64_t))
+                                   && HEAPload(&hp->strimps, nme, "tstrimps", 
false) == GDK_SUCCEED) {
+                                       hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
+                                       hp->pairs_base = hp->sizes_base + 
STRIMP_HEADER_SIZE;   /* pairs just after the offsets. */
+                                       hp->bitstrings_base = hp->strimps.base 
+ hsize;   /* bitmasks just after the pairs */
 
-                                               close(fd);
-                                               ATOMIC_INIT(&hp->strimps.refs, 
1);
-                                               // STRMPincref(hp);
-                                               hp->strimps.parentid = 
b->batCacheid;
-                                               b->tstrimps = hp;
-                                               TRC_DEBUG(ACCELERATOR, 
"BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b));
-                                               MT_lock_unset(&b->batIdxLock);
-                                               return true;
-                                       }
                                        close(fd);
-                                       /* unlink unusable file */
-                                       GDKunlink(hp->strimps.farmid, BATDIR, 
nme, "tstrimp");
+                                       ATOMIC_INIT(&hp->strimps.refs, 1);
+                                       hp->strimps.parentid = b->batCacheid;
+                                       b->tstrimps = hp;
+                                       TRC_DEBUG(ACCELERATOR, 
"BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b));
+                                       return true;
+                               }
+                               close(fd);
+                               /* unlink unusable file */
+                               GDKunlink(hp->strimps.farmid, BATDIR, nme, 
"tstrimps");
 
-                               }
                        }
-                       GDKfree(hp);
-                       GDKclrerr();    /* we're not currently interested in 
errors */
                }
-               MT_lock_unset(&b->batIdxLock);
+               /* For some reason the index exists but was not read
+                * correctly from disk. Set the pointer to the value 2
+                * to signify that it needs to be recreated.
+                */
+               b->tstrimps = (Strimps *)2;
+               GDKfree(hp);
+               GDKclrerr();    /* we're not currently interested in errors */
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to