Changeset: d1f23189724f for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/d1f23189724f Branch: default Log Message:
Merge strimps_update into default diffs (truncated from 1283 to 300 lines): diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -190,6 +190,7 @@ gdk_return BATsemijoin(BAT **r1p, BAT ** BAT *BATsetaccess(BAT *b, restrict_t mode) __attribute__((__warn_unused_result__)); void BATsetcapacity(BAT *b, BUN cnt); void BATsetcount(BAT *b, BUN cnt); +gdk_return BATsetstrimps(BAT *b); BAT *BATslice(BAT *b, BUN low, BUN high); gdk_return BATsort(BAT **sorted, BAT **order, BAT **groups, BAT *b, BAT *o, BAT *g, bool reverse, bool nilslast, bool stable) __attribute__((__warn_unused_result__)); gdk_return BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils, bool nil_if_empty, const char *restrict separator); @@ -401,7 +402,7 @@ BUN SORTfndfirst(BAT *b, const void *v); BUN SORTfndlast(BAT *b, const void *v); gdk_return STRMPcreate(BAT *b, BAT *s); void STRMPdestroy(BAT *b); -BAT *STRMPfilter(BAT *b, BAT *s, const char *q); +BAT *STRMPfilter(BAT *b, BAT *s, const char *q, const bool keep_nils); MT_Id THRcreate(void (*f)(void *), void *arg, enum MT_thr_detach d, const char *name); void *THRdata[THREADDATA]; void THRdel(Thread t); diff --git a/gdk/ChangeLog b/gdk/ChangeLog --- a/gdk/ChangeLog +++ b/gdk/ChangeLog @@ -1,6 +1,19 @@ # ChangeLog file for GDK # This file is updated with Maddlog +* Thu Jun 2 2022 Panagiotis Koutsourakis <kutsu...@monetdbsolutions.com> +- The interface for using strimps has not changed (create an imprint index + on a column of a read only table), but now construction happens at the + beginning of the first query that uses the strimp and is performed in + a multithreaded manner. + +* Thu May 12 2022 Panagiotis Koutsourakis <kutsu...@monetdbsolutions.com> +- Implemented the use of strimps for NOT LIKE queries. The idea is to + run the corresponding LIKE query using strimps and take the complement + of the result. We keep around NULL values both during strimp filtering + and during the pcre part of the LIKE query so that they get discarded + automatically when we take the complement. + * Mon Mar 21 2022 Sjoerd Mullender <sjo...@acm.org> - The function BBPkeepref now gets a BAT pointer as argument instead of a bat id. diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -1845,9 +1845,10 @@ gdk_export lng IMPSimprintsize(BAT *b); /* Strimps exported functions */ gdk_export gdk_return STRMPcreate(BAT *b, BAT *s); -gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const char *q); +gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const char *q, const bool keep_nils); gdk_export void STRMPdestroy(BAT *b); gdk_export bool BAThasstrimps(BAT *b); +gdk_export gdk_return BATsetstrimps(BAT *b); /* The ordered index structure */ diff --git a/gdk/gdk_cand.c b/gdk/gdk_cand.c --- a/gdk/gdk_cand.c +++ b/gdk/gdk_cand.c @@ -330,7 +330,7 @@ BATdiffcand(BAT *a, BAT *b) while (!is_oid_nil(ob) && ob < oa) { ob = canditer_next(&cib); } - if (!is_oid_nil(ob) && oa < ob) + if (is_oid_nil(ob) || oa < ob) *p++ = oa; } } diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -12,10 +12,11 @@ * A string imprint is an index that can be used as a prefilter in LIKE * queries. It has 2 components: * - * - a header of 64 string element pairs. + * - a header of 63 string element pairs. * * - a 64 bit mask for each string in the BAT that encodes the presence - * or absence of each element of the header in the specific item. + * or absence of each element of the header in the specific item or if + * the corresponding entry in the BAT is nil. * * A string imprint is stored in a new Heap in the BAT, aligned in 8 * byte (64 bit) words. @@ -24,12 +25,12 @@ * header of the strimp is encoded. The least significant byte (v in the * schematic below) is the version number. The second (np) is the number * of pairs in the header. In the current implementation this is always - * 64. The next 2 bytes (hs) is the total size of the header in + * 63. The next 2 bytes (hs) is the total size of the header in * bytes. Finally the fifth byte is the persistence byte. The last 3 * bytes needed to align to the 8 byte boundary should be zero, and are * reserved for future use. * - * The following np bytes are the sizes of the pairs. These can have + * The following np + 1 bytes are the sizes of the pairs. These can have * values from 2 to 8 and are the number of bytes that the corresponding * pair takes up. Following that there are the bytes encoding the actual * pairs. @@ -61,19 +62,21 @@ * * - Take the np most frequent pairs as the Strimp Header. * - * - For each string s in the BAT, construct an np-bit mask, m_s that - * encodes the presence or absence of each member of the header in the - * string. + * - For each string s in the BAT, construct an (np + 1)-bit mask, m_s + * that encodes the presence or absence of each member of the header + * in the string or if s is nil. * * Filtering with a query string q goes as follows: * - * - Use the strimp header to construct an np-bit mask for q encoding - * the presence or absence of each member of the header in q. + * - Use the strimp header to construct an (np + 1)-bit mask for q + * encoding the presence or absence of each member of the header in q. * - * - For each bitmask in the strimp compute the bitwise AND of m_s and - * q. If the result is equal to q, that means that string s contains - * the same strimp header elements as q, so it is kept for more - * detailed examination. + * - For each bitmask in the strimp, first check if it encodes a nil + * value and keep it if it needs to be kept (this happens for NOT LIKE + * queries). Otherwise compute the bitwise AND of m_s and q. If the + * result is equal to q, that means that string s contains the same + * strimp header elements as q, so it is kept for more detailed + * examination. */ #include "monetdb_config.h" @@ -241,11 +244,6 @@ STRMPmakebitstring(const char *s, Strimp * STRIMP_HEADER_SIZE largest elements. This depends on the size of the * histogram n. For some small n sorting might be more efficient, but * for such inputs the difference should not be noticeable. - * - * TODO: Explore if a priority queue (heap construction and 64 extract - * maximums) is worth it. The tradeoff here is that it will complicate - * the code but might improve performance. In a debug build the current - * implementation takes ~200 μs. */ static void STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp) @@ -397,6 +395,20 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai return res; } +/* Read a strimp structure from disk. + * + * If the pointer b->tstrimps has the value 1, it means that the strimp + * is on disk. This routine attempts to read it so that it can be used. + * + * There are a number of checks made for example we check that the + * strimps version on disk matches the one the code recognizes, and that + * the number of pairs encoded on disk matches the one we expect. If any + * of these checks fail, we remove the file from disk since it is now + * unusable, and set the pointer b->tstrimps to 2 so that the strimp + * will be recreated. + * + * This function returns true if at the end we have a valid pointer. + */ static bool BATcheckstrimps(BAT *b) { @@ -407,82 +419,77 @@ BATcheckstrimps(BAT *b) return false; assert(b->batCacheid > 0); + if (b->tstrimps == (Strimps *)1) { - assert(!GDKinmemory(b->theap->farmid)); - MT_lock_set(&b->batIdxLock); - if (b->tstrimps == (Strimps *)1) { - Strimps *hp; - const char *nme = BBP_physical(b->batCacheid); - int fd; + Strimps *hp; + const char *nme = BBP_physical(b->batCacheid); + int fd; + + MT_thread_setalgorithm("read strimp index from disk"); - b->tstrimps = NULL; - if ((hp = GDKzalloc(sizeof(Strimps))) != NULL && - (hp->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) >= 0) { - strconcat_len(hp->strimps.filename, - sizeof(hp->strimps.filename), - nme, ".tstrimps", NULL); + b->tstrimps = NULL; + if ((hp = GDKzalloc(sizeof(Strimps))) != NULL && + (hp->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) >= 0) { + strconcat_len(hp->strimps.filename, + sizeof(hp->strimps.filename), + nme, ".tstrimps", NULL); - /* check whether a persisted strimp can be found */ - if ((fd = GDKfdlocate(hp->strimps.farmid, nme, "rb+", "tstrimps")) >= 0) { - struct stat st; - uint64_t desc; - size_t npairs; - size_t hsize; - /* Read the 8 byte long strimp - * descriptor. - * - * NPAIRS must be 64 in the - * current implementation. - * - * HSIZE must be between 200 and - * 584 (inclusive): 8 bytes the - * descritor, 64 bytes the pair - * sizes and n*64 bytes the - * actual pairs where 2 <= n <= - * 8. - */ - if (read(fd, &desc, 8) == 8 - && (desc & 0xff) == STRIMP_VERSION - && ((npairs = NPAIRS(desc)) == 64) - && (hsize = HSIZE(desc)) >= 200 && hsize <= 584 - && ((desc >> 32) & 0xff) == 1 /* check the persistence byte */ - && fstat(fd, &st) == 0 - /* TODO: We might need padding in the UTF-8 case. */ - && st.st_size >= (off_t) (hp->strimps.free = hp->strimps.size = - /* header size (desc + offsets + pairs) */ - hsize + - /* bitmasks */ - BATcount(b)*sizeof(uint64_t)) - && HEAPload(&hp->strimps, nme, "tstrimps", false) == GDK_SUCCEED) { - hp->sizes_base = (uint8_t *)hp->strimps.base + 8; /* sizes just after the descriptor */ - hp->pairs_base = hp->sizes_base + npairs; /* pairs just after the offsets */ - hp->bitstrings_base = hp->strimps.base + hsize; /* bitmasks just after the pairs */ + /* check whether a persisted strimp can be found */ + if ((fd = GDKfdlocate(hp->strimps.farmid, nme, "rb+", "tstrimps")) >= 0) { + struct stat st; + uint64_t desc; + size_t npairs; + size_t hsize; + /* Read the 8 byte long strimp + * descriptor. + * + * HSIZE must be between 200 and + * 584 (inclusive): 8 bytes the + * descritor, 64 bytes the pair + * sizes and n*64 bytes the + * actual pairs where 2 <= n <= + * 8. + */ + if (read(fd, &desc, 8) == 8 + && (desc & 0xff) == STRIMP_VERSION + && ((npairs = NPAIRS(desc)) == STRIMP_PAIRS) + && (hsize = HSIZE(desc)) >= 200 && hsize <= 584 + && ((desc >> 32) & 0xff) == 1 /* check the persistence byte */ + && fstat(fd, &st) == 0 + /* TODO: We might need padding in the UTF-8 case. */ + && st.st_size >= (off_t) (hp->strimps.free = hp->strimps.size = + /* header size (desc + offsets + pairs) */ + hsize + + /* bitmasks */ + BATcount(b)*sizeof(uint64_t)) + && HEAPload(&hp->strimps, nme, "tstrimps", false) == GDK_SUCCEED) { + hp->sizes_base = (uint8_t *)hp->strimps.base + 8; /* sizes just after the descriptor */ + hp->pairs_base = hp->sizes_base + STRIMP_HEADER_SIZE; /* pairs just after the offsets. */ + hp->bitstrings_base = hp->strimps.base + hsize; /* bitmasks just after the pairs */ - close(fd); - ATOMIC_INIT(&hp->strimps.refs, 1); - // STRMPincref(hp); - hp->strimps.parentid = b->batCacheid; - b->tstrimps = hp; - TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b)); - MT_lock_unset(&b->batIdxLock); - return true; - } close(fd); - /* unlink unusable file */ - GDKunlink(hp->strimps.farmid, BATDIR, nme, "tstrimp"); + ATOMIC_INIT(&hp->strimps.refs, 1); + hp->strimps.parentid = b->batCacheid; + b->tstrimps = hp; + TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b)); + return true; + } + close(fd); + /* unlink unusable file */ + GDKunlink(hp->strimps.farmid, BATDIR, nme, "tstrimps"); - } } - GDKfree(hp); - GDKclrerr(); /* we're not currently interested in errors */ } - MT_lock_unset(&b->batIdxLock); + /* For some reason the index exists but was not read + * correctly from disk. Set the pointer to the value 2 + * to signify that it needs to be recreated. + */ + b->tstrimps = (Strimps *)2; + GDKfree(hp); + GDKclrerr(); /* we're not currently interested in errors */ _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org