Changeset: d85a5982f7d4 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/d85a5982f7d4
Modified Files:
        monetdb5/modules/atoms/str.c
        monetdb5/modules/kernel/batstr.c
        monetdb5/modules/mal/txtsim.c
Branch: txtsim
Log Message:

Asciify string function. BATSTR proto.


diffs (truncated from 402 to 300 lines):

diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c
--- a/monetdb5/modules/atoms/str.c
+++ b/monetdb5/modules/atoms/str.c
@@ -64,6 +64,10 @@
 #include "str.h"
 #include <string.h>
 
+#if HAVE_ICONV
+#include <errno.h>
+#include <iconv.h>
+#endif
+
 /*
  * UTF-8 Handling
  * UTF-8 is a way to store Unicode strings in zero-terminated byte
@@ -4745,6 +4749,44 @@ STRspace(str *res, const int *ll)
        return msg;
 }
 
+/*
+ * str.asciify: transliterate the UTF-8 string *s to plain ASCII using
+ * iconv's "ASCII//TRANSLIT" target and store the result in *r.
+ *
+ * - A nil input yields a freshly duplicated nil string.
+ * - On success *r points to a NUL-terminated buffer obtained from
+ *   GDKmalloc/GDKrealloc (or GDKstrdup for nil).
+ * - On failure a MAL exception is returned; for a non-nil input *r is
+ *   left NULL.
+ * - Without iconv support the function always throws.
+ */
+static str
+STRasciify(str *r, const str *s)
+{
+#if HAVE_ICONV
+       /* nil in, nil out */
+       if (strNil(*s)) {
+               if ((*r = GDKstrdup(str_nil)) == NULL)
+                       throw(MAL, "str.asciify", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+               return MAL_SUCCEED;
+       }
+
+       const char *f = "UTF8", *t = "ASCII//TRANSLIT";
+       char *in = *s;
+       size_t in_len = strlen(in);
+       /* Initial guess: one output byte per input byte plus the terminator.
+          Transliteration may expand a character into several ASCII bytes,
+          so the buffer is grown below whenever iconv reports E2BIG. */
+       size_t size = in_len + 1;
+       char *buf, *out;
+       size_t out_len;
+       iconv_t cd;
+
+       *r = NULL;
+       /* man iconv; /TRANSLIT */
+       if ((cd = iconv_open(t, f)) == (iconv_t) -1)
+               throw(MAL, "str.asciify", "Cannot convert from (%s) to (%s).", f, t);
+       if ((buf = GDKmalloc(size)) == NULL) {
+               /* do not leak the conversion descriptor */
+               iconv_close(cd);
+               throw(MAL, "str.asciify", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+       }
+       out = buf;
+       /* keep one byte back so the terminating NUL always fits */
+       out_len = size - 1;
+       while (iconv(cd, &in, &in_len, &out, &out_len) == (size_t) -1) {
+               if (errno != E2BIG) {
+                       /* free the start of the allocation, not the cursor
+                          that iconv has advanced */
+                       GDKfree(buf);
+                       iconv_close(cd);
+                       throw(MAL, "str.asciify", "ICONV: string conversion failed from (%s) to (%s)", f, t);
+               }
+               /* Output buffer exhausted: enlarge it and resume where
+                  iconv stopped (in/in_len were updated by iconv). */
+               size_t used = (size_t) (out - buf);
+               size_t nsize = size * 2 + 8;
+               char *nbuf = GDKrealloc(buf, nsize);
+               if (nbuf == NULL) {
+                       GDKfree(buf);
+                       iconv_close(cd);
+                       throw(MAL, "str.asciify", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+               }
+               buf = nbuf;
+               size = nsize;
+               out = buf + used;
+               out_len = size - 1 - used;
+       }
+       *out = '\0';
+       iconv_close(cd);
+       *r = buf;
+       return MAL_SUCCEED;
+#else
+       (void) r;
+       (void) s;
+       throw(MAL, "str.asciify", "ICONV library not available.");
+#endif
+}
+
 #include "mel.h"
 mel_func str_init_funcs[] = {
  command("str", "str", STRtostr, false, "Noop routine.", args(1,2, 
arg("",str),arg("s",str))),
@@ -4788,6 +4830,7 @@ mel_func str_init_funcs[] = {
  command("str", "repeat", STRrepeat, false, "", args(1,3, 
arg("",str),arg("s2",str),arg("c",int))),
  command("str", "space", STRspace, false, "", args(1,2, 
arg("",str),arg("l",int))),
  command("str", "epilogue", STRepilogue, false, "", args(1,1, arg("",void))),
+ command("str", "asciify", STRasciify, false, "Transform in str from UTF8 to 
ASCII", args(1, 2, arg("out",str), arg("in",str))),
  { .imp=NULL }
 };
 #include "mal_import.h"
diff --git a/monetdb5/modules/kernel/batstr.c b/monetdb5/modules/kernel/batstr.c
--- a/monetdb5/modules/kernel/batstr.c
+++ b/monetdb5/modules/kernel/batstr.c
@@ -4906,6 +4906,18 @@ bailout:
        return msg;
 }
 
+/*
+ * batstr.asciify: placeholder for the bulk (BAT) variant of str.asciify.
+ *
+ * With iconv available this currently does nothing and reports success;
+ * without iconv it throws.  Neither argument is read or written yet.
+ * NOTE(review): the success path leaves *r unset -- confirm no caller
+ * relies on the result before the real implementation lands.
+ */
+static str
+BATSTRasciify(str *r, const str *s)
+{
+       /* unused in both configurations for now; silence warnings once */
+       (void) r;
+       (void) s;
+#if HAVE_ICONV
+       return MAL_SUCCEED;
+#else
+       throw(MAL, "str.asciify", "ICONV library not available.");
+#endif
+}
+
 #include "mel.h"
 mel_func batstr_init_funcs[] = {
  pattern("batstr", "length", STRbatLength, false, "Return the length of a 
string.", args(1,2, batarg("",int),batarg("s",str))),
@@ -5058,6 +5070,7 @@ mel_func batstr_init_funcs[] = {
  pattern("batstr", "repeat", STRbatrepeat_strcst, false, "", args(1,4, 
batarg("",str),arg("s",str),batarg("c",int),batarg("s",oid))),
  pattern("batstr", "space", STRbatSpace, false, "", args(1,2, 
batarg("",str),batarg("l",int))),
  pattern("batstr", "space", STRbatSpace, false, "", args(1,3, 
batarg("",str),batarg("l",int),batarg("s",oid))),
+ command("batstr", "asciify", BATSTRasciify, false, "Transform in str from 
UTF8 to ASCII", args(1, 2, batarg("out",str), batarg("in",str))),
  { .imp=NULL }
 };
 #include "mal_import.h"
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -9,17 +9,35 @@
  */
 
 /*
- * @f txtsim
- * @t Module providing similarity metrics for strings
- * @a Romulo Goncalves (from M4 to M5)
- * @d 01/12/2007
- * @v 0.1
+ * String Metrics
+ * Module providing similarity metrics for strings.
+ *
+ * NEW:
+ * Levenshtein distance
+ * maxLevenshtein
+ * minLevenshtein (missing... but do it anyway?)
+ * Jaro–Winkler distance
+ * maxJaroWinkler (missing... but do it anyway?)
+ * minJaroWinkler
+ * startsWith
+ * endsWith
+ * endsWith
  *
- * @+ String metrics
- *
- * Provides basic similarity metrics for strings.
- *
+ * ~ New:
+ * levenshtein - levenshtein dist + var op costs (ins/del, replace, transp)
+ * levenshtein - basic levenshtein dist
+ * editdistance - levenshtein alias ?
+ * editdistance2 - duplicate of editdistance ?
+ * similarity - normalized edit distance + minimum(?)
+ * similarity - normalized edit distance
+ * similarity - bulk normalized edit distance
+ * soundex
+ * stringdiff
+ * qgramnormalize
+ * qgramselfjoin
+ * str2grams
  */
+
 #include "monetdb_config.h"
 #include "mal.h"
 #include <string.h>
@@ -29,16 +47,16 @@
 #include "mal_exception.h"
 
 
-#define RETURN_NIL_IF(b,t) \
-       if (b) {\
-          if (ATOMextern(t)) {\
-             *(ptr*) res = (ptr) ATOMnil(t);\
-                 if ( *(ptr *) res == NULL)\
-                       throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL);\
-          } else {\
-             memcpy(res, ATOMnilptr(t), ATOMsize(t));\
-          }\
-          return MAL_SUCCEED; \
+/*
+ * RETURN_NIL_IF(b, t): when condition b holds, store the nil value of
+ * atom type t through the enclosing function's `res` pointer and return
+ * MAL_SUCCEED from that function.
+ *
+ * - If ATOMextern(t) is true, *res receives the pointer returned by
+ *   ATOMnil(t); a NULL result is reported as an allocation failure.
+ * - Otherwise the nil pattern from ATOMnilptr(t) is copied into res
+ *   (ATOMsize(t) bytes).
+ *
+ * The macro expands to a bare `if` statement and expects a variable
+ * named `res` to be in scope at the point of use.
+ */
+#define RETURN_NIL_IF(b,t) \
+       if (b) { \
+               if (ATOMextern(t)) { \
+                       *(ptr*) res = (ptr) ATOMnil(t); \
+                       if ( *(ptr *) res == NULL) \
+                               throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
+               } else { \
+                       memcpy(res, ATOMnilptr(t), ATOMsize(t)); \
+               } \
+               return MAL_SUCCEED; \
        }
 
 /* =========================================================================
@@ -217,7 +235,7 @@ levenshteinbasic2_impl(int *result, str 
 
 /* Per-letter code table with 26 entries, one for each letter in
  * alphabetical order (first entry = 'A', last = 'Z').
  * NOTE(review): the values match the classic soundex digit classes
  * (0 = ignored letters such as vowels, 1 = B/F/P/V, ... 6 = R);
  * presumably consumed by the soundex code below -- confirm at the
  * use site. */
 static const int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
-       1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
+                                                       1, 2, 6, 2, 3, 0, 1, 0, 
2, 0, 2
 };
 
 static inline char
@@ -386,48 +404,48 @@ struct partition {
 };
 
 /* NAME
-       diag - find diagonal path
+   diag - find diagonal path
 
    SYNOPSIS
-       int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
-               struct partition *part);
+   int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
+   struct partition *part);
 
    DESCRIPTION
-       Find the midpoint of the shortest edit script for a specified
-       portion of the two strings.
+   Find the midpoint of the shortest edit script for a specified
+   portion of the two strings.
 
-       Scan from the beginnings of the strings, and simultaneously from
-       the ends, doing a breadth-first search through the space of
-       edit-sequence.  When the two searches meet, we have found the
-       midpoint of the shortest edit sequence.
+   Scan from the beginnings of the strings, and simultaneously from
+   the ends, doing a breadth-first search through the space of
+   edit-sequence.  When the two searches meet, we have found the
+   midpoint of the shortest edit sequence.
 
-       If MINIMAL is nonzero, find the minimal edit script regardless
-       of expense.  Otherwise, if the search is too expensive, use
-       heuristics to stop the search and report a suboptimal answer.
+   If MINIMAL is nonzero, find the minimal edit script regardless
+   of expense.  Otherwise, if the search is too expensive, use
+   heuristics to stop the search and report a suboptimal answer.
 
    RETURNS
-       Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
-       number XMID - YMID equals the number of inserted characters
-       minus the number of deleted characters (counting only characters
-       before the midpoint).  Return the approximate edit cost; this is
-       the total number of characters inserted or deleted (counting
-       only characters before the midpoint), unless a heuristic is used
-       to terminate the search prematurely.
+   Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
+   number XMID - YMID equals the number of inserted characters
+   minus the number of deleted characters (counting only characters
+   before the midpoint).  Return the approximate edit cost; this is
+   the total number of characters inserted or deleted (counting
+   only characters before the midpoint), unless a heuristic is used
+   to terminate the search prematurely.
 
-       Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
-       for the left half of the partition is known; similarly for
-       PART->RIGHT_MINIMAL.
+   Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
+   for the left half of the partition is known; similarly for
+   PART->RIGHT_MINIMAL.
 
    CAVEAT
-       This function assumes that the first characters of the specified
-       portions of the two strings do not match, and likewise that the
-       last characters do not match.  The caller must trim matching
-       characters from the beginning and end of the portions it is
-       going to specify.
+   This function assumes that the first characters of the specified
+   portions of the two strings do not match, and likewise that the
+   last characters do not match.  The caller must trim matching
+   characters from the beginning and end of the portions it is
+   going to specify.
 
-       If we return the "wrong" partitions, the worst this can do is
-       cause suboptimal diff output.  It cannot cause incorrect diff
-       output.  */
+   If we return the "wrong" partitions, the worst this can do is
+   cause suboptimal diff output.  It cannot cause incorrect diff
+   output.  */
 
 static inline int
 diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition 
*part, int too_expensive, struct string_data *string, int *fdiag, int *bdiag)
@@ -594,23 +612,23 @@ diag(int xoff, int xlim, int yoff, int y
 
 
 /* NAME
-       compareseq - find edit sequence
+   compareseq - find edit sequence
 
    SYNOPSIS
-       void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
+   void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
 
    DESCRIPTION
-       Compare in detail contiguous subsequences of the two strings
-       which are known, as a whole, to match each other.
+   Compare in detail contiguous subsequences of the two strings
+   which are known, as a whole, to match each other.
 
-       The subsequence of string 0 is [XOFF, XLIM) and likewise for
-       string 1.
+   The subsequence of string 0 is [XOFF, XLIM) and likewise for
+   string 1.
 
-       Note that XLIM, YLIM are exclusive bounds.  All character
-       numbers are origin-0.
+   Note that XLIM, YLIM are exclusive bounds.  All character
+   numbers are origin-0.
 
-       If MINIMAL is nonzero, find a minimal difference no matter how
-       expensive it is.  */
+   If MINIMAL is nonzero, find a minimal difference no matter how
+   expensive it is.  */
 
 static inline void
 compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, int max_edits, 
int too_expensive, struct string_data *string, int *fdiag, int *bdiag) /* 
compareseq stops when edits > max_edits */
@@ -666,35 +684,35 @@ compareseq(int xoff, int xlim, int yoff,
 }
 
 /* NAME
-       fstrcmp - fuzzy string compare
+   fstrcmp - fuzzy string compare
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to