Changeset: d85a5982f7d4 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/d85a5982f7d4 Modified Files: monetdb5/modules/atoms/str.c monetdb5/modules/kernel/batstr.c monetdb5/modules/mal/txtsim.c Branch: txtsim Log Message:
Asciify string function. BATSTR proto. diffs (truncated from 402 to 300 lines): diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c --- a/monetdb5/modules/atoms/str.c +++ b/monetdb5/modules/atoms/str.c @@ -64,6 +64,10 @@ #include "str.h" #include <string.h> +#if HAVE_ICONV +#include <iconv.h> +#endif + /* * UTF-8 Handling * UTF-8 is a way to store Unicode strings in zero-terminated byte @@ -4745,6 +4749,44 @@ STRspace(str *res, const int *ll) return msg; } +static str +STRasciify(str *r, const str *s) +{ +#if HAVE_ICONV + /* Handle NULL and return early */ + if (strNil(*s)) { + if ((*r = GDKstrdup(str_nil)) == NULL) + throw(MAL, "str.asciify", SQLSTATE(HY013) MAL_MALLOC_FAIL); + else + return MAL_SUCCEED; + } + iconv_t cd; + const str f = "UTF8", t = "ASCII//TRANSLIT"; + str in = *s, out; + /* Output string length LEN+1 when converting from UTF-8 TO ASCII + should be enough. If for some reason LEN is needed in its totality, + +1 safeguards the \0.*/ + size_t in_len = strlen(in), out_len = in_len + 1; + /* man iconv; /TRANSLIT */ + if ((cd = iconv_open(t, f)) == (iconv_t)(-1)) + throw(MAL, "str.asciify", "Cannot convert from (%s) to (%s).", f, t); + if ((*r = out = GDKmalloc(out_len)) == NULL) + throw(MAL, "str.asciify", SQLSTATE(HY013) MAL_MALLOC_FAIL); + size_t x = iconv(cd, &in, &in_len, &out, &out_len); + if (x == (size_t)-1) { + GDKfree(out); + *r = NULL; + iconv_close(cd); + throw(MAL, "str.asciify", "ICONV: string conversion failed from (%s) to (%s)", f, t); + } + *out = '\0'; + iconv_close(cd); + return MAL_SUCCEED; +#else + throw(MAL, "str.asciify", "ICONV library not available."); +#endif +} + #include "mel.h" mel_func str_init_funcs[] = { command("str", "str", STRtostr, false, "Noop routine.", args(1,2, arg("",str),arg("s",str))), @@ -4788,6 +4830,7 @@ mel_func str_init_funcs[] = { command("str", "repeat", STRrepeat, false, "", args(1,3, arg("",str),arg("s2",str),arg("c",int))), command("str", "space", STRspace, false, "", args(1,2, 
arg("",str),arg("l",int))), command("str", "epilogue", STRepilogue, false, "", args(1,1, arg("",void))), + command("str", "asciify", STRasciify, false, "Transform in str from UTF8 to ASCII", args(1, 2, arg("out",str), arg("in",str))), { .imp=NULL } }; #include "mal_import.h" diff --git a/monetdb5/modules/kernel/batstr.c b/monetdb5/modules/kernel/batstr.c --- a/monetdb5/modules/kernel/batstr.c +++ b/monetdb5/modules/kernel/batstr.c @@ -4906,6 +4906,18 @@ bailout: return msg; } +static str +BATSTRasciify(str *r, const str *s) +{ +#if HAVE_ICONV + (void)r; + (void)s; + return MAL_SUCCEED; +#else + throw(MAL, "str.asciify", "ICONV library not available."); +#endif +} + #include "mel.h" mel_func batstr_init_funcs[] = { pattern("batstr", "length", STRbatLength, false, "Return the length of a string.", args(1,2, batarg("",int),batarg("s",str))), @@ -5058,6 +5070,7 @@ mel_func batstr_init_funcs[] = { pattern("batstr", "repeat", STRbatrepeat_strcst, false, "", args(1,4, batarg("",str),arg("s",str),batarg("c",int),batarg("s",oid))), pattern("batstr", "space", STRbatSpace, false, "", args(1,2, batarg("",str),batarg("l",int))), pattern("batstr", "space", STRbatSpace, false, "", args(1,3, batarg("",str),batarg("l",int),batarg("s",oid))), + command("batstr", "asciify", BATSTRasciify, false, "Transform in str from UTF8 to ASCII", args(1, 2, batarg("out",str), batarg("in",str))), { .imp=NULL } }; #include "mal_import.h" diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c --- a/monetdb5/modules/mal/txtsim.c +++ b/monetdb5/modules/mal/txtsim.c @@ -9,17 +9,35 @@ */ /* - * @f txtsim - * @t Module providing similarity metrics for strings - * @a Romulo Goncalves (from M4 to M5) - * @d 01/12/2007 - * @v 0.1 + * String Metrics + * Module providing similarity metrics for strings. + * + * NEW: + * Levenshtein distance + * maxLevenshtein + * minLevenshtein (missing... but do it anyway?) + * Jaro–Winkler distance + * maxJaroWinkler (missing... but do it anyway?) 
+ * minJaroWinkler + * startsWith + * endsWith + * endsWith * - * @+ String metrics - * - * Provides basic similarity metrics for strings. - * + * ~ New: + * levenshtein - levenshtein dist + var op costs (ins/del, replace, transp) + * levenshtein - basic levenshtein dist + * editdistance - levenshtein alias ? + * editdistance2 - duplicate of editdistance ? + * similarity - normalized edit distance + minimum(?) + * similarity - normalized edit distance + * similarity - bulk normalized edit distance + * soundex + * stringdiff + * qgramnormalize + * qgramselfjoin + * str2grams */ + #include "monetdb_config.h" #include "mal.h" #include <string.h> @@ -29,16 +47,16 @@ #include "mal_exception.h" -#define RETURN_NIL_IF(b,t) \ - if (b) {\ - if (ATOMextern(t)) {\ - *(ptr*) res = (ptr) ATOMnil(t);\ - if ( *(ptr *) res == NULL)\ - throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL);\ - } else {\ - memcpy(res, ATOMnilptr(t), ATOMsize(t));\ - }\ - return MAL_SUCCEED; \ +#define RETURN_NIL_IF(b,t) \ + if (b) { \ + if (ATOMextern(t)) { \ + *(ptr*) res = (ptr) ATOMnil(t); \ + if ( *(ptr *) res == NULL) \ + throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL); \ + } else { \ + memcpy(res, ATOMnilptr(t), ATOMsize(t)); \ + } \ + return MAL_SUCCEED; \ } /* ========================================================================= @@ -217,7 +235,7 @@ levenshteinbasic2_impl(int *result, str /* set letter values */ static const int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, - 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2 + 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2 }; static inline char @@ -386,48 +404,48 @@ struct partition { }; /* NAME - diag - find diagonal path + diag - find diagonal path SYNOPSIS - int diag(int xoff, int xlim, int yoff, int ylim, int minimal, - struct partition *part); + int diag(int xoff, int xlim, int yoff, int ylim, int minimal, + struct partition *part); DESCRIPTION - Find the midpoint of the shortest edit script for a specified - portion of the two strings. 
+ Find the midpoint of the shortest edit script for a specified + portion of the two strings. - Scan from the beginnings of the strings, and simultaneously from - the ends, doing a breadth-first search through the space of - edit-sequence. When the two searches meet, we have found the - midpoint of the shortest edit sequence. + Scan from the beginnings of the strings, and simultaneously from + the ends, doing a breadth-first search through the space of + edit-sequence. When the two searches meet, we have found the + midpoint of the shortest edit sequence. - If MINIMAL is nonzero, find the minimal edit script regardless - of expense. Otherwise, if the search is too expensive, use - heuristics to stop the search and report a suboptimal answer. + If MINIMAL is nonzero, find the minimal edit script regardless + of expense. Otherwise, if the search is too expensive, use + heuristics to stop the search and report a suboptimal answer. RETURNS - Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal - number XMID - YMID equals the number of inserted characters - minus the number of deleted characters (counting only characters - before the midpoint). Return the approximate edit cost; this is - the total number of characters inserted or deleted (counting - only characters before the midpoint), unless a heuristic is used - to terminate the search prematurely. + Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal + number XMID - YMID equals the number of inserted characters + minus the number of deleted characters (counting only characters + before the midpoint). Return the approximate edit cost; this is + the total number of characters inserted or deleted (counting + only characters before the midpoint), unless a heuristic is used + to terminate the search prematurely. - Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script - for the left half of the partition is known; similarly for - PART->RIGHT_MINIMAL. 
+ Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script + for the left half of the partition is known; similarly for + PART->RIGHT_MINIMAL. CAVEAT - This function assumes that the first characters of the specified - portions of the two strings do not match, and likewise that the - last characters do not match. The caller must trim matching - characters from the beginning and end of the portions it is - going to specify. + This function assumes that the first characters of the specified + portions of the two strings do not match, and likewise that the + last characters do not match. The caller must trim matching + characters from the beginning and end of the portions it is + going to specify. - If we return the "wrong" partitions, the worst this can do is - cause suboptimal diff output. It cannot cause incorrect diff - output. */ + If we return the "wrong" partitions, the worst this can do is + cause suboptimal diff output. It cannot cause incorrect diff + output. */ static inline int diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part, int too_expensive, struct string_data *string, int *fdiag, int *bdiag) @@ -594,23 +612,23 @@ diag(int xoff, int xlim, int yoff, int y /* NAME - compareseq - find edit sequence + compareseq - find edit sequence SYNOPSIS - void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal); + void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal); DESCRIPTION - Compare in detail contiguous subsequences of the two strings - which are known, as a whole, to match each other. + Compare in detail contiguous subsequences of the two strings + which are known, as a whole, to match each other. - The subsequence of string 0 is [XOFF, XLIM) and likewise for - string 1. + The subsequence of string 0 is [XOFF, XLIM) and likewise for + string 1. - Note that XLIM, YLIM are exclusive bounds. All character - numbers are origin-0. + Note that XLIM, YLIM are exclusive bounds. 
All character + numbers are origin-0. - If MINIMAL is nonzero, find a minimal difference no matter how - expensive it is. */ + If MINIMAL is nonzero, find a minimal difference no matter how + expensive it is. */ static inline void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, int max_edits, int too_expensive, struct string_data *string, int *fdiag, int *bdiag) /* compareseq stops when edits > max_edits */ @@ -666,35 +684,35 @@ compareseq(int xoff, int xlim, int yoff, } /* NAME - fstrcmp - fuzzy string compare + fstrcmp - fuzzy string compare _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org