Changeset: 3a64271e8751 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/3a64271e8751 Removed Files: sql/test/BugTracker-2020/Tests/txtsim-parallel.Bug-7016.test Modified Files: clients/Tests/MAL-signatures-hge.test clients/Tests/MAL-signatures.test monetdb5/modules/mal/txtsim.c sql/scripts/48_txtsim.sql sql/test/BugTracker-2016/Tests/DISTINCT_with_correlated_scalar_subquery_crashes_mserver.Bug-3920.test sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.int128 sql/test/emptydb-previous-upgrade-chain/Tests/upgrade.stable.out sql/test/emptydb-previous-upgrade-chain/Tests/upgrade.stable.out.int128 sql/test/emptydb-previous-upgrade-hge/Tests/upgrade.stable.out.int128 sql/test/emptydb-previous-upgrade/Tests/upgrade.stable.out sql/test/emptydb-previous-upgrade/Tests/upgrade.stable.out.int128 sql/test/emptydb-upgrade-chain-hge/Tests/upgrade.stable.out.int128 sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out.int128 sql/test/emptydb-upgrade-chain/Tests/upgrade.stable.out.ppc64.int128 sql/test/emptydb-upgrade-hge/Tests/upgrade.stable.out.int128 sql/test/emptydb-upgrade/Tests/upgrade.stable.out sql/test/emptydb-upgrade/Tests/upgrade.stable.out.int128 sql/test/emptydb/Tests/check.stable.out sql/test/emptydb/Tests/check.stable.out.32bit sql/test/emptydb/Tests/check.stable.out.int128 sql/test/miscellaneous/Tests/simple_selects.test sql/test/testdb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.int128 sql/test/testdb-previous-upgrade-chain/Tests/upgrade.stable.out sql/test/testdb-previous-upgrade-chain/Tests/upgrade.stable.out.int128 sql/test/testdb-previous-upgrade-hge/Tests/upgrade.stable.out.int128 sql/test/testdb-previous-upgrade/Tests/upgrade.stable.out sql/test/testdb-previous-upgrade/Tests/upgrade.stable.out.int128 sql/test/testdb-upgrade-chain-hge/Tests/upgrade.stable.out.int128 sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out sql/test/testdb-upgrade-chain/Tests/upgrade.stable.out.int128 sql/test/testdb-upgrade-hge/Tests/upgrade.stable.out.int128 sql/test/testdb-upgrade/Tests/upgrade.stable.out sql/test/testdb-upgrade/Tests/upgrade.stable.out.int128 Branch: default Log Message:
Remove similarity functionality (DEPRECATED in Jun2023). diffs (truncated from 1255 to 300 lines): diff --git a/clients/Tests/MAL-signatures-hge.test b/clients/Tests/MAL-signatures-hge.test --- a/clients/Tests/MAL-signatures-hge.test +++ b/clients/Tests/MAL-signatures-hge.test @@ -35568,11 +35568,6 @@ maxlevenshtein pattern battxtsim.maxlevenshtein(X_0:bat[:str], X_1:bat[:str], X_2:int, X_3:int, X_4:int):bat[:bit] BATTXTSIMmaxlevenshtein; Same as maxlevenshtein but for BATS -battxtsim -similarity -command battxtsim.similarity(X_0:bat[:str], X_1:bat[:str]):bat[:dbl] -fstrcmp0_impl_bulk; -(Deprecated) Normalized edit distance between two strings baturl extractURLHost command baturl.extractURLHost(X_0:bat[:str], X_1:bit):bat[:str] @@ -51179,16 +51174,6 @@ command txtsim.qgramselfjoin(X_0:bat[:oi qgram_selfjoin; QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions txtsim -similarity -command txtsim.similarity(X_0:str, X_1:str):dbl -fstrcmp0_impl; -(Deprecated) Normalized edit distance between two strings -txtsim -similarity -command txtsim.similarity(X_0:str, X_1:str, X_2:dbl):dbl -fstrcmp_impl; -(Deprecated) Normalized edit distance between two strings -txtsim soundex command txtsim.soundex(X_0:str):str soundex; diff --git a/clients/Tests/MAL-signatures.test b/clients/Tests/MAL-signatures.test --- a/clients/Tests/MAL-signatures.test +++ b/clients/Tests/MAL-signatures.test @@ -26618,11 +26618,6 @@ maxlevenshtein pattern battxtsim.maxlevenshtein(X_0:bat[:str], X_1:bat[:str], X_2:int, X_3:int, X_4:int):bat[:bit] BATTXTSIMmaxlevenshtein; Same as maxlevenshtein but for BATS -battxtsim -similarity -command battxtsim.similarity(X_0:bat[:str], X_1:bat[:str]):bat[:dbl] -fstrcmp0_impl_bulk; -(Deprecated) Normalized edit distance between two strings baturl extractURLHost command baturl.extractURLHost(X_0:bat[:str], X_1:bit):bat[:str] @@ -39504,16 +39499,6 @@ command txtsim.qgramselfjoin(X_0:bat[:oi qgram_selfjoin; QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions txtsim -similarity -command txtsim.similarity(X_0:str, X_1:str):dbl -fstrcmp0_impl; -(Deprecated) Normalized edit distance between two strings -txtsim -similarity -command txtsim.similarity(X_0:str, X_1:str, X_2:dbl):dbl -fstrcmp_impl; -(Deprecated) Normalized edit distance between two strings -txtsim soundex command txtsim.soundex(X_0:str):str soundex; diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c --- a/monetdb5/modules/mal/txtsim.c +++ b/monetdb5/modules/mal/txtsim.c @@ -1391,484 +1391,6 @@ str_2_qgrams(bat *ret, str *val) return MAL_SUCCEED; } -/* DEPRECATED (see DEPRECATED_END) */ -#define INITIAL_INT_BUFFER_LENGTH 2048 -#define CHECK_INT_BUFFER_LENGTH(BUFFER, BUFFER_LEN, NEXT_LEN, OP) \ - do { \ - if ((NEXT_LEN) > *BUFFER_LEN) { \ - size_t newlen = (((NEXT_LEN) + 1023) & ~1023); /* align to a multiple of 1024 bytes */ \ - int *newbuf = GDKmalloc(newlen); \ - if (!newbuf) \ - throw(MAL, OP, SQLSTATE(HY013) MAL_MALLOC_FAIL); \ - GDKfree(*BUFFER); \ - *BUFFER = newbuf; \ - *BUFFER_LEN = newlen; \ - } \ - } while (0) - -struct string_data { - /* The string to be compared. */ - const char *data; - - /* The length of the string to be compared. */ - int data_length; - - /* The number of characters inserted or deleted. */ - int edit_count; -}; - -struct partition { - /* Midpoints of this partition. */ - int xmid, ymid; - - /* Nonzero if low half will be analyzed minimally. */ - int lo_minimal; - - /* Likewise for high half. */ - int hi_minimal; -}; - -/* NAME - diag - find diagonal path - - SYNOPSIS - int diag(int xoff, int xlim, int yoff, int ylim, int minimal, - struct partition *part); - - DESCRIPTION - Find the midpoint of the shortest edit script for a specified - portion of the two strings. - - Scan from the beginnings of the strings, and simultaneously from - the ends, doing a breadth-first search through the space of - edit-sequence. When the two searches meet, we have found the - midpoint of the shortest edit sequence. - - If MINIMAL is nonzero, find the minimal edit script regardless - of expense. Otherwise, if the search is too expensive, use - heuristics to stop the search and report a suboptimal answer. - - RETURNS - Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal - number XMID - YMID equals the number of inserted characters - minus the number of deleted characters (counting only characters - before the midpoint). Return the approximate edit cost; this is - the total number of characters inserted or deleted (counting - only characters before the midpoint), unless a heuristic is used - to terminate the search prematurely. - - Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script - for the left half of the partition is known; similarly for - PART->RIGHT_MINIMAL. - - CAVEAT - This function assumes that the first characters of the specified - portions of the two strings do not match, and likewise that the - last characters do not match. The caller must trim matching - characters from the beginning and end of the portions it is - going to specify. - - If we return the "wrong" partitions, the worst this can do is - cause suboptimal diff output. It cannot cause incorrect diff - output. */ - -static inline int -diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part, int too_expensive, struct string_data *string, int *fdiag, int *bdiag) -{ - int *const fd = fdiag; /* Give the compiler a chance. */ - int *const bd = bdiag; /* Additional help for the compiler. */ - const char *const xv = string[0].data; /* Still more help for the compiler. */ - const char *const yv = string[1].data; /* And more and more . . . */ - const int dmin = xoff - ylim; /* Minimum valid diagonal. */ - const int dmax = xlim - yoff; /* Maximum valid diagonal. */ - const int fmid = xoff - yoff; /* Center diagonal of top-down search. */ - const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ - int fmin = fmid; - int fmax = fmid; /* Limits of top-down search. */ - int bmin = bmid; - int bmax = bmid; /* Limits of bottom-up search. */ - int c; /* Cost. */ - int odd = (fmid - bmid) & 1; - - /* - * True if southeast corner is on an odd diagonal with respect - * to the northwest. - */ - fd[fmid] = xoff; - bd[bmid] = xlim; - for (c = 1;; ++c) { - int d; /* Active diagonal. */ - - /* Extend the top-down search by an edit step in each diagonal. */ - if (fmin > dmin) - fd[--fmin - 1] = -1; - else - ++fmin; - if (fmax < dmax) - fd[++fmax + 1] = -1; - else - --fmax; - for (d = fmax; d >= fmin; d -= 2) { - int x; - int y; - int tlo; - int thi; - - tlo = fd[d - 1], thi = fd[d + 1]; - - if (tlo >= thi) - x = tlo + 1; - else - x = thi; - y = x - d; - while (x < xlim && y < ylim && xv[x] == yv[y]) { - ++x; - ++y; - } - fd[d] = x; - if (odd && bmin <= d && d <= bmax && bd[d] <= x) { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = 1; - return 2 * c - 1; - } - } - /* Similarly extend the bottom-up search. */ - if (bmin > dmin) - bd[--bmin - 1] = INT_MAX; - else - ++bmin; - if (bmax < dmax) - bd[++bmax + 1] = INT_MAX; - else - --bmax; - for (d = bmax; d >= bmin; d -= 2) { - int x; - int y; - int tlo; - int thi; - - tlo = bd[d - 1], thi = bd[d + 1]; - if (tlo < thi) - x = tlo; - else - x = thi - 1; - y = x - d; - while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { - --x; - --y; - } - bd[d] = x; - if (!odd && fmin <= d && d <= fmax && x <= fd[d]) { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = 1; - return 2 * c; - } - } - - if (minimal) - continue; - - /* Heuristic: if we've gone well beyond the call of duty, give up - and report halfway between our best results so far. */ - if (c >= too_expensive) { - int fxybest; - int fxbest; - int bxybest; - int bxbest; - - /* Pacify `gcc -Wall'. */ - fxbest = 0; - bxbest = 0; - - /* Find forward diagonal that maximizes X + Y. */ - fxybest = -1; - for (d = fmax; d >= fmin; d -= 2) { - int x; - int y; - - x = fd[d] < xlim ? fd[d] : xlim; - y = x - d; - - if (ylim < y) { - x = ylim + d; - y = ylim; - } - if (fxybest < x + y) { - fxybest = x + y; - fxbest = x; - } - } - /* Find backward diagonal that minimizes X + Y. */ - bxybest = INT_MAX; - for (d = bmax; d >= bmin; d -= 2) { - int x; - int y; - - x = xoff > bd[d] ? xoff : bd[d]; - y = x - d; - - if (y < yoff) { - x = yoff + d; - y = yoff; - } - if (x + y < bxybest) { - bxybest = x + y; - bxbest = x; - } - } - /* Use the better of the two diagonals. */ - if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) { _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org