Changeset: 4086ef73cf38 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/4086ef73cf38
Added Files:
        monetdb5/modules/mal/ngram.c
        monetdb5/modules/mal/ngram.h
Modified Files:
        monetdb5/modules/mal/CMakeLists.txt
        sql/scripts/48_txtsim.sql
Branch: strimps_v3
Log Message:

Integrate module. Compiling. WIP


diffs (truncated from 2069 to 300 lines):

diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt
--- a/monetdb5/modules/mal/CMakeLists.txt
+++ b/monetdb5/modules/mal/CMakeLists.txt
@@ -40,7 +40,8 @@ target_sources(malmodules
   tracer.c
   projectionpath.c
   tablet.c tablet.h
-  batcalc.c calc.c)
+  batcalc.c calc.c
+  ngram.c ngram.h)
 
 target_include_directories(malmodules
   PRIVATE
diff --git a/monetdb5/modules/mal/ngram.c b/monetdb5/modules/mal/ngram.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/ngram.c
@@ -0,0 +1,1637 @@
+/*
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2024 MonetDB Foundation;
+ * Copyright August 2008 - 2023 MonetDB B.V.;
+ * Copyright 1997 - July 2008 CWI.
+ */
+
+#include <monetdb_config.h>
+#include <mal_exception.h>
+#include <gdk_cand.h>
+#include <gdk_atoms.h>
+#include <string.h>
+
+/* NOTE(review): M is not referenced in the visible part of this file. */
+#define M 1000000
+/*
+ * GZ is the number of buckets a character is mapped to and CHAR_MAP
+ * reduces a character code to a bucket index in [0, GZ) by masking.
+ * The 128-bucket variant is compiled out; the active one keeps the low
+ * six bits.  The macro argument is parenthesised so that an expression
+ * argument such as CHAR_MAP(a | b) is masked as a whole.
+ */
+#if 0
+#define GZ 128
+#define CHAR_MAP(s) ((s) & 127)
+#else
+#define GZ 64
+#define CHAR_MAP(s) ((s) & 63)
+#endif
+/* GZ raised to the power 1..4; the last one is computed in size_t. */
+#define SZ_1GRAM GZ
+#define SZ_2GRAM (GZ*GZ)
+#define SZ_3GRAM (GZ*GZ*GZ)
+#define SZ_4GRAM ((size_t)GZ*GZ*GZ*GZ)
+/*
+ * First of several inclusions of ngram.h, this one with the sht names.
+ * The generic identifiers are mapped to sht-suffixed ones, the header
+ * is included, and afterwards every macro is undefined again so that a
+ * later inclusion can redefine it.
+ * NGRAM_CST is defined empty, so a use written as NGRAM_CST(v) is left
+ * as plain (v).
+ * NOTE(review): ngram.h is not visible here; presumably it defines the
+ * functions named by these macros -- confirm.
+ * NOTE(review): NGRAM_BITS 15 looks like "width of sht minus the sign
+ * bit" -- confirm against ngram.h.
+ */
+#define hist_1gram sht_hist_1gram
+#define hist_2gram sht_hist_2gram
+#define hist_3gram sht_hist_3gram
+#define NGsignature NGsignature_sht
+#define NGand NGand_sht
+#define NGandselect NGandselect_sht
+#define NGRAM_TYPE sht
+#define NGRAM_TYPEID TYPE_sht
+#define NGRAM_TYPENIL sht_nil
+#define NGRAM_CST
+#define NGRAM_BITS 15
+#include "ngram.h"
+
+#undef hist_1gram
+#undef hist_2gram
+#undef hist_3gram
+#undef NGsignature
+#undef NGand
+#undef NGandselect
+#undef NGRAM_TYPE
+#undef NGRAM_TYPEID
+#undef NGRAM_TYPENIL
+#undef NGRAM_CST
+#undef NGRAM_BITS
+/*
+ * Inclusion of ngram.h with the int names: the generic identifiers are
+ * mapped to int-suffixed ones, the header is included, and every macro
+ * is undefined again afterwards so it can be redefined.
+ * NGRAM_CST is defined empty, so a use written as NGRAM_CST(v) is left
+ * as plain (v).
+ * NOTE(review): ngram.h is not visible here; presumably it defines the
+ * functions named by these macros -- confirm.
+ * NOTE(review): NGRAM_BITS 31 looks like "width of int minus the sign
+ * bit" -- confirm against ngram.h.
+ */
+#define hist_1gram int_hist_1gram
+#define hist_2gram int_hist_2gram
+#define hist_3gram int_hist_3gram
+#define NGsignature NGsignature_int
+#define NGand NGand_int
+#define NGandselect NGandselect_int
+#define NGRAM_TYPE int
+#define NGRAM_TYPEID TYPE_int
+#define NGRAM_TYPENIL int_nil
+#define NGRAM_CST
+#define NGRAM_BITS 31
+#include "ngram.h"
+
+#undef hist_1gram
+#undef hist_2gram
+#undef hist_3gram
+#undef NGsignature
+#undef NGand
+#undef NGandselect
+#undef NGRAM_TYPE
+#undef NGRAM_TYPEID
+#undef NGRAM_TYPENIL
+#undef NGRAM_CST
+#undef NGRAM_BITS
+/*
+ * Inclusion of ngram.h with the lng names: the generic identifiers are
+ * mapped to lng-suffixed ones, the header is included, and every macro
+ * is undefined again afterwards so it can be redefined.
+ * Here NGRAM_CST(v) wraps its argument in LL_CONSTANT(v).
+ * NOTE(review): ngram.h is not visible here; presumably it defines the
+ * functions named by these macros -- confirm.
+ * NOTE(review): NGRAM_BITS 63 looks like "width of lng minus the sign
+ * bit" -- confirm against ngram.h.
+ */
+#define hist_1gram lng_hist_1gram
+#define hist_2gram lng_hist_2gram
+#define hist_3gram lng_hist_3gram
+#define NGsignature NGsignature_lng
+#define NGand NGand_lng
+#define NGandselect NGandselect_lng
+#define NGRAM_TYPE lng
+#define NGRAM_TYPEID TYPE_lng
+#define NGRAM_TYPENIL lng_nil
+#define NGRAM_CST(v) LL_CONSTANT(v)
+#define NGRAM_BITS 63
+#include "ngram.h"
+
+#undef hist_1gram
+#undef hist_2gram
+#undef hist_3gram
+#undef NGsignature
+#undef NGand
+#undef NGandselect
+#undef NGRAM_TYPE
+#undef NGRAM_TYPEID
+#undef NGRAM_TYPENIL
+#undef NGRAM_CST
+#undef NGRAM_BITS
+
+/*
+ * Inclusion of ngram.h with the hge names.  The hge type, TYPE_hge and
+ * hge_nil only exist when the build has a 128-bit integer type, so the
+ * whole section is compiled only under HAVE_HGE.
+ * NGRAM_CST(v) casts LL_CONSTANT(v) to hge.
+ * NOTE(review): any later reference to the hge-suffixed functions needs
+ * the same HAVE_HGE guard.
+ */
+#ifdef HAVE_HGE
+#define hist_1gram hge_hist_1gram
+#define hist_2gram hge_hist_2gram
+#define hist_3gram hge_hist_3gram
+#define NGsignature NGsignature_hge
+#define NGand NGand_hge
+#define NGandselect NGandselect_hge
+#define NGRAM_TYPE hge
+#define NGRAM_TYPEID TYPE_hge
+#define NGRAM_TYPENIL hge_nil
+#define NGRAM_CST(v) ((hge)LL_CONSTANT(v))
+#define NGRAM_BITS 127
+#include "ngram.h"
+
+#undef hist_1gram
+#undef hist_2gram
+#undef hist_3gram
+#undef NGsignature
+#undef NGand
+#undef NGandselect
+#undef NGRAM_TYPE
+#undef NGRAM_TYPEID
+#undef NGRAM_TYPENIL
+#undef NGRAM_CST
+#undef NGRAM_BITS
+#endif /* HAVE_HGE */
+
+/*
+ * Shared back end of NGandjoin and NGandjoin1.
+ * Currently a stub: no argument is read or written and MAL_SUCCEED is
+ * returned unconditionally.
+ */
+static str
+NGandjoin_intern(bat *L, bat *R,
+                 bat *sigs, bat *needle,
+                 bat *lc, bat *rc,
+                 bit *nil_matches, lng *estimate, bit *anti)
+{
+       /* results (unused for now) */
+       (void) L;
+       (void) R;
+       /* inputs (unused for now) */
+       (void) sigs;
+       (void) needle;
+       (void) lc;
+       (void) rc;
+       /* options (unused for now) */
+       (void) nil_matches;
+       (void) estimate;
+       (void) anti;
+
+       return MAL_SUCCEED;
+}
+
+/*
+ * Variant without R and rc: forwards to NGandjoin_intern, passing NULL
+ * in both of those positions.
+ */
+static str
+NGandjoin1(bat *L, bat *sigs, bat *needle, bat *lc,
+           bit *nil_matches, lng *estimate, bit *anti)
+{
+       str msg = NGandjoin_intern(L, NULL, sigs, needle,
+                                  lc, NULL,
+                                  nil_matches, estimate, anti);
+
+       return msg;
+}
+
+/*
+ * Full variant: passes every argument through to NGandjoin_intern
+ * unchanged and returns its result.
+ */
+static str
+NGandjoin(bat *L, bat *R, bat *sigs, bat *needle,
+          bat *lc, bat *rc,
+          bit *nil_matches, lng *estimate, bit *anti)
+{
+       str msg = NGandjoin_intern(L, R, sigs, needle,
+                                  lc, rc,
+                                  nil_matches, estimate, anti);
+
+       return msg;
+}
+
+/*
+ * Return the number of bits set in x (0..64).
+ *
+ * Uses the compiler builtin where one is known to exist and otherwise
+ * a portable bit-twiddling fallback.  __popcnt64 is only selected for
+ * MSVC x64 targets, since it is an x64-only intrinsic; other MSVC
+ * targets take the fallback.  All paths convert to the declared return
+ * type explicitly.
+ */
+static inline int
+popcount64(uint64_t x)
+{
+#if defined(__GNUC__)
+    return (int) __builtin_popcountll(x);
+#elif defined(_MSC_VER) && defined(_M_X64)
+    return (int) __popcnt64(x);
+#else
+    /* sum adjacent 1-bit, 2-bit and 4-bit fields in parallel ... */
+    x = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
+    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+    x = (x & 0x0F0F0F0F0F0F0F0FULL) + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
+    /* ... then add the eight byte counts into the top byte */
+    return (int) ((x * 0x0101010101010101ULL) >> 56);
+#endif
+}
+
+/*
+ * Store in *cnt the number of set bits of *v, as computed by
+ * popcount64.  Always returns MAL_SUCCEED.
+ */
+static str
+NGpopcnt(int *cnt, lng *v)
+{
+       const lng bits = *v;
+
+       *cnt = popcount64(bits);
+       return MAL_SUCCEED;
+}
+
+/*
+ * Scalar placeholder for ngram.signature: ignores its arguments and
+ * always raises a MAL exception saying there is no scalar version.
+ * The second parameter is named s so that it does not hide the str
+ * typedef inside the function.
+ */
+static str
+NGsignature_dummy(str *sig, str *s, int *n)
+{
+       (void) sig;
+       (void) s;
+       (void) n;
+       throw(MAL, "ngram.signature", "no scalar version\n");
+}
+
+/*
+ * Combine all values of the BAT *bid, read as lng, with bitwise OR and
+ * store the outcome in *res.
+ *
+ * - nil values are skipped;
+ * - if no non-nil value is seen, *res is set to lng_nil;
+ * - a negative value contributes its negation;
+ * - a zero value ends the scan with *res set to 0.
+ *
+ * Returns MAL_SUCCEED, or a MAL exception if the BAT cannot be found.
+ * The exception now carries the "ngram.gor" prefix, the same one the
+ * grouped aggregate in this file uses.
+ */
+static char *
+gor_lng(lng *res, const bat *bid)
+{
+       BAT *b = BATdescriptor(*bid);
+
+       if (b == NULL)
+               throw(MAL, "ngram.gor", RUNTIME_OBJECT_MISSING);
+
+       const lng *vals = (const lng *) Tloc(b, 0);
+       const BUN n = BATcount(b);
+       lng val = 0;
+       BUN nval = 0;           /* number of non-nil values seen */
+
+       for (BUN i = 0; i < n; i++) {
+               const lng v = vals[i];
+
+               if (is_lng_nil(v))
+                       continue;       /* nils are ignored */
+               nval++;
+               if (v == 0) {
+                       /* NOTE(review): kept from the original; stopping
+                        * with result 0 on a zero input fits a product,
+                        * not an OR (0 is the OR identity) -- confirm
+                        * the intended semantics */
+                       val = 0;
+                       break;
+               }
+               /* NOTE(review): OR-ing in the negation of negative
+                * inputs is kept as is -- confirm it is intended */
+               val |= v < 0 ? -v : v;
+       }
+       /* single release point for the descriptor */
+       BBPunfix(b->batCacheid);
+
+       *res = nval == 0 ? lng_nil : val;
+       return MAL_SUCCEED;
+}
+
+static char *
+subgrouped_gor_cand_lng(bat *retval, const bat *bid, const bat *gid,
+                       const bat *eid, const bat *sid,
+                       const bit *skip_nils)
+{
+       BAT *b, *bn;            /* these two are always assigned */
+       BAT *g = NULL;          /* these three are optional and may not ... */
+       BAT *e = NULL;          /* ... be assigned to below, ... */
+       BAT *s = NULL;          /* ... so we initialize them here */
+
+       /* we ignore these two inputs */
+       (void) skip_nils;
+
+       /* the bat we're supposed to be working on (bid) is not
+        * optional, but the others are, so we test whether the bat id
+        * is not nil, and if it isn't, whether we can find the BAT
+        * descriptor */
+       if ((b = BATdescriptor(*bid)) == NULL ||
+           (gid && !is_bat_nil(*gid) && (g = BATdescriptor(*gid)) == NULL) ||
+           (eid && !is_bat_nil(*eid) && (e = BATdescriptor(*eid)) == NULL) ||
+           (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL)) {
+               if (b)
+                       BBPunfix(b->batCacheid);
+               if (g)
+                       BBPunfix(g->batCacheid);
+               if (e)
+                       BBPunfix(e->batCacheid);
+               if (s)
+                       BBPunfix(s->batCacheid);
+               throw(MAL, "ngram.gor", RUNTIME_OBJECT_MISSING);
+       }
+
+       oid min, max;           /* min and max group id */
+       BUN ngrp;       /* number of groups, number of candidates */
+       struct canditer ci;     /* candidate list iterator */
+       const char *err;        /* error message */
+       err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp, &ci);
+       if (err != NULL) {
+               BBPunfix(b->batCacheid);
+               if (g)
+                       BBPunfix(g->batCacheid);
+               if (e)
+                       BBPunfix(e->batCacheid);
+               if (s)
+                       BBPunfix(s->batCacheid);
+               throw(MAL, "ngram.gor", "%s\n", err);
+       }
+
+       /* create a result BAT and initialize it with all zeros */
+       bn = BATconstant(min, TYPE_lng, &(lng){0}, ngrp, TRANSIENT);
+       if (bn == NULL) {
+               BBPunfix(b->batCacheid);
+               if (g)
+                       BBPunfix(g->batCacheid);
+               if (e)
+                       BBPunfix(e->batCacheid);
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to