Changeset: 4086ef73cf38 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/4086ef73cf38 Added Files: monetdb5/modules/mal/ngram.c monetdb5/modules/mal/ngram.h Modified Files: monetdb5/modules/mal/CMakeLists.txt sql/scripts/48_txtsim.sql Branch: strimps_v3 Log Message:
Integrate module. Compiling. WIP

diffs (truncated from 2069 to 300 lines):

diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt
--- a/monetdb5/modules/mal/CMakeLists.txt
+++ b/monetdb5/modules/mal/CMakeLists.txt
@@ -40,7 +40,8 @@ target_sources(malmodules
   tracer.c
   projectionpath.c
   tablet.c tablet.h
-  batcalc.c calc.c)
+  batcalc.c calc.c
+  ngram.c ngram.h)
 
 target_include_directories(malmodules
   PRIVATE
diff --git a/monetdb5/modules/mal/ngram.c b/monetdb5/modules/mal/ngram.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/ngram.c
@@ -0,0 +1,1637 @@
+/*
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2024 MonetDB Foundation;
+ * Copyright August 2008 - 2023 MonetDB B.V.;
+ * Copyright 1997 - July 2008 CWI.
+ */
+
+#include <monetdb_config.h>
+#include <mal_exception.h>
+#include <gdk_cand.h>
+#include <gdk_atoms.h>
+#include <string.h>
+
+/* NOTE(review): M is not used in the visible part of this file;
+ * presumably "one million" for sizing/timing -- confirm or remove. */
+#define M 1000000
+/*
+ * GZ is the size of the reduced alphabet and CHAR_MAP folds a character
+ * code into the range [0, GZ) by keeping only its low bits.  The 128
+ * symbol variant is compiled out; the 64 symbol variant is active.
+ * The macro argument is parenthesized so that an expression argument
+ * (e.g. a|b) is masked as a whole.
+ */
+#if 0
+#define GZ 128
+#define CHAR_MAP(s) ((s)&127)
+#else
+#define GZ 64
+#define CHAR_MAP(s) ((s)&63)
+#endif
+/* number of distinct n-grams over the reduced alphabet: GZ to the power n */
+#define SZ_1GRAM GZ
+#define SZ_2GRAM (GZ*GZ)
+#define SZ_3GRAM (GZ*GZ*GZ)
+#define SZ_4GRAM ((size_t)GZ*GZ*GZ*GZ)
+
+/*
+ * ngram.h is included several times, once per integer type.  Before
+ * each inclusion the generic names below are mapped onto type-suffixed
+ * names and the element type, its type id, its nil value, a constant
+ * wrapper and a bit count are selected; afterwards everything is
+ * #undef'ed again so the next inclusion can redefine it.
+ * NOTE(review): NGRAM_CST is defined empty (object-like) here, so a use
+ * written as NGRAM_CST(v) would expand to plain (v) -- confirm against
+ * ngram.h.
+ */
+#define hist_1gram sht_hist_1gram
+#define hist_2gram sht_hist_2gram
+#define hist_3gram sht_hist_3gram
+#define NGsignature NGsignature_sht
+#define NGand NGand_sht
+#define NGandselect NGandselect_sht
+#define NGRAM_TYPE sht
+#define NGRAM_TYPEID TYPE_sht
+#define NGRAM_TYPENIL sht_nil
+#define NGRAM_CST
+#define NGRAM_BITS 15
+#include "ngram.h"
+
+#undef hist_1gram
+#undef hist_2gram
+#undef hist_3gram
+#undef NGsignature
+#undef NGand
+#undef NGandselect
+#undef NGRAM_TYPE
+#undef NGRAM_TYPEID
+#undef NGRAM_TYPENIL
+#undef NGRAM_CST
+#undef NGRAM_BITS
+
+#define hist_1gram int_hist_1gram
+#define hist_2gram int_hist_2gram
+#define hist_3gram int_hist_3gram
+#define NGsignature NGsignature_int
+#define NGand NGand_int
+#define NGandselect NGandselect_int
+#define NGRAM_TYPE int
+#define NGRAM_TYPEID TYPE_int
+#define NGRAM_TYPENIL int_nil
+#define NGRAM_CST
+#define NGRAM_BITS 31
+#include "ngram.h"	/* expand the ngram.h code for element type int */
+
+#undef hist_1gram
+#undef hist_2gram
+#undef hist_3gram
+#undef NGsignature
+#undef NGand
+#undef NGandselect
+#undef NGRAM_TYPE
+#undef NGRAM_TYPEID
+#undef NGRAM_TYPENIL
+#undef NGRAM_CST
+#undef NGRAM_BITS
+
+/* Same expansion for element type lng; here NGRAM_CST is a
+ * function-like macro that wraps its argument in LL_CONSTANT(). */
+#define hist_1gram lng_hist_1gram
+#define hist_2gram lng_hist_2gram
+#define hist_3gram lng_hist_3gram
+#define NGsignature NGsignature_lng
+#define NGand NGand_lng
+#define NGandselect NGandselect_lng
+#define NGRAM_TYPE lng
+#define NGRAM_TYPEID TYPE_lng
+#define NGRAM_TYPENIL lng_nil
+#define NGRAM_CST(v) LL_CONSTANT(v)
+#define NGRAM_BITS 63
+#include "ngram.h"
+
+#undef hist_1gram
+#undef hist_2gram
+#undef hist_3gram
+#undef NGsignature
+#undef NGand
+#undef NGandselect
+#undef NGRAM_TYPE
+#undef NGRAM_TYPEID
+#undef NGRAM_TYPENIL
+#undef NGRAM_CST
+#undef NGRAM_BITS
+
+/* Same expansion for element type hge; constants are built with
+ * LL_CONSTANT() and then cast to hge.
+ * NOTE(review): this instantiation is unconditional -- confirm whether
+ * it needs a guard for builds without the hge type. */
+#define hist_1gram hge_hist_1gram
+#define hist_2gram hge_hist_2gram
+#define hist_3gram hge_hist_3gram
+#define NGsignature NGsignature_hge
+#define NGand NGand_hge
+#define NGandselect NGandselect_hge
+#define NGRAM_TYPE hge
+#define NGRAM_TYPEID TYPE_hge
+#define NGRAM_TYPENIL hge_nil
+#define NGRAM_CST(v) ((hge)LL_CONSTANT(v))
+#define NGRAM_BITS 127
+#include "ngram.h"
+
+#undef hist_1gram
+#undef hist_2gram
+#undef hist_3gram
+#undef NGsignature
+#undef NGand
+#undef NGandselect
+#undef NGRAM_TYPE
+#undef NGRAM_TYPEID
+#undef NGRAM_TYPENIL
+#undef NGRAM_CST
+#undef NGRAM_BITS
+
+/*
+ * Common entry point behind NGandjoin1 and NGandjoin.
+ * Placeholder: every argument is ignored, nothing is written to the
+ * output parameters, and MAL_SUCCEED is returned.
+ */
+static str
+NGandjoin_intern(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti)
+{
+	(void)L;
+	(void)R;
+	(void)sigs;
+	(void)needle;
+	(void)lc;
+	(void)rc;
+	(void)nil_matches;
+	(void)estimate;
+	(void)anti;
+	return MAL_SUCCEED;
+}
+
+/* Single-result variant: forwards to NGandjoin_intern with NULL for
+ * both the R result and the rc argument. */
+static str
+NGandjoin1(bat *L, bat *sigs, bat *needle, bat *lc, bit *nil_matches, lng *estimate, bit *anti)
+{
+	return NGandjoin_intern(L, NULL, sigs, needle, lc, NULL, nil_matches, estimate, anti);
+}
+
+/* Two-result variant: forwards all arguments unchanged to
+ * NGandjoin_intern. */
+static str
+NGandjoin(bat *L, bat *R, bat *sigs, bat *needle, bat *lc, bat *rc, bit *nil_matches, lng *estimate, bit *anti)
+{
+	return NGandjoin_intern(L, R, sigs, needle, lc, rc, nil_matches, estimate, anti);
+}
+
+/*
+ * Return the number of 1-bits in x (0..64).
+ * Uses the compiler intrinsic where one is known; otherwise counts in
+ * parallel: sum neighbouring 1-, 2- and 4-bit fields, then add up the
+ * eight byte counts with one multiplication and take the top byte.
+ */
+static inline int
+popcount64(uint64_t x)
+{
+#if defined(__GNUC__)
+	return (int) __builtin_popcountll(x);
+#elif defined(_MSC_VER)
+	return (int) __popcnt64(x);
+#else
+	x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
+	x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+	x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
+	return (int) ((x * 0x0101010101010101ULL) >> 56);
+#endif
+}
+
+/*
+ * Store in *cnt the number of 1-bits in the two's complement bit
+ * pattern of *v.  Always returns MAL_SUCCEED.
+ * NOTE(review): *v is not tested for nil, so a nil input is counted
+ * like any other bit pattern -- confirm that is intended.
+ */
+static str
+NGpopcnt(int *cnt, lng *v)
+{
+	*cnt = popcount64((uint64_t) *v);
+	return MAL_SUCCEED;
+}
+
+/*
+ * Scalar signature stub: ignores its arguments and always raises a MAL
+ * exception saying that no scalar version exists.
+ */
+static str
+NGsignature_dummy( str *sig, str *s, int *n)
+{
+	(void)sig;
+	(void)s;
+	(void)n;
+	throw(MAL, "ngram.signature", "no scalar version\n");
+}
+
+/*
+ * Aggregate all values of a BAT into one lng by OR-ing together the
+ * absolute values of its non-nil entries.
+ *
+ * res - output: the OR of the absolute values; 0 as soon as a zero
+ *       value is seen; lng_nil if there is no non-nil value at all
+ * bid - id of the input BAT
+ *
+ * Returns MAL_SUCCEED, or a MAL exception when the BAT descriptor
+ * cannot be obtained.
+ *
+ * NOTE(review): the tail is read as lng without checking the BAT's
+ * type -- assumes the caller only passes lng BATs; confirm.
+ * NOTE(review): returning 0 on the first zero value and OR-ing
+ * absolute values is unusual for a bitwise OR (0 is normally the
+ * neutral element) -- confirm this is the intended semantics.
+ */
+static char *
+gor_lng(lng *res, const bat *bid)
+{
+	BAT *b;
+	lng val = 0;
+	BUN nval = 0;
+
+	if ((b = BATdescriptor(*bid)) == NULL)
+		throw(MAL, "ngram.gor", RUNTIME_OBJECT_MISSING);
+
+	const lng *vals = (const lng *) Tloc(b, 0);
+	for (BUN i = 0, n = BATcount(b); i < n; i++) {
+		if (is_lng_nil(vals[i]))
+			continue;		/* nils are ignored */
+		if (vals[i] == 0) {
+			/* any value zero is easy: result is zero */
+			BBPunfix(b->batCacheid);
+			*res = 0;
+			return MAL_SUCCEED;
+		}
+		/* nil was skipped above, so negating cannot overflow
+		 * provided lng_nil is the minimum lng value */
+		if (vals[i] < 0) {
+			val |= -vals[i];
+		} else {
+			val |= vals[i];
+		}
+		nval++;		/* count non-nil values */
+	}
+	BBPunfix(b->batCacheid);
+	if (nval == 0) {
+		/* if there are no non-nil values, the result is nil */
+		*res = lng_nil;
+	} else {
+		*res = val;
+	}
+	return MAL_SUCCEED;
+}
+
+static char *
+subgrouped_gor_cand_lng(bat *retval, const bat *bid, const bat *gid,
+			const bat *eid, const bat *sid,
+			const bit
*skip_nils) +{ + BAT *b, *bn; /* these two are always assigned */ + BAT *g = NULL; /* these three are optional and may not ... */ + BAT *e = NULL; /* ... be assigned to below, ... */ + BAT *s = NULL; /* ... so we initialize them here */ + + /* we ignore these two inputs */ + (void) skip_nils; + + /* the bat we're supposed to be working on (bid) is not + * optional, but the others are, so we test whether the bat id + * is not nil, and if it isn't, whether we can find the BAT + * descriptor */ + if ((b = BATdescriptor(*bid)) == NULL || + (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) || + (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) || + (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) { + if (b) + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); + if (s) + BBPunfix(s->batCacheid); + throw(MAL, "ngram.gor", RUNTIME_OBJECT_MISSING); + } + + oid min, max; /* min and max group id */ + BUN ngrp; /* number of groups, number of candidates */ + struct canditer ci; /* candidate list iterator */ + const char *err; /* error message */ + err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci); + if (err != NULL) { + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); + if (s) + BBPunfix(s->batCacheid); + throw(MAL, "ngram.gor", "%s\n", err); + } + + /* create a result BAT and initialize it with all zeros */ + bn = BATconstant(min, TYPE_lng, &(lng){0}, ngrp, TRANSIENT); + if (bn == NULL) { + BBPunfix(b->batCacheid); + if (g) + BBPunfix(g->batCacheid); + if (e) + BBPunfix(e->batCacheid); _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org