Changeset: 54df68af9352 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/54df68af9352
Modified Files:
        monetdb5/modules/atoms/str.c
        monetdb5/modules/kernel/batstr.c
Branch: txtsim
Log Message:

STRreverse and BATSTRreverse implementation.


diffs (219 lines):

diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c
--- a/monetdb5/modules/atoms/str.c
+++ b/monetdb5/modules/atoms/str.c
@@ -4785,6 +4785,66 @@ STRasciify(str *r, const str *s)
 #endif
 }
 
+/* Reverse the UTF-8 encoded string *arg, keeping the bytes of every
+ * multibyte character together and in their original order.
+ *
+ * On success *ret is set to a buffer obtained from GDKmalloc that holds
+ * the reversed string (or a copy of str_nil when the input is nil) and
+ * MAL_SUCCEED is returned.  If that buffer cannot be allocated, the
+ * function leaves through throw() with MAL_MALLOC_FAIL and *ret is not
+ * written. */
+static str
+STRreverse(str *ret, const str *arg)
+{
+       const char *src = *arg;
+       size_t len;
+       str dst;
+
+       if (strNil(src)) {
+               /* special case for nil:str, handled before anything is
+                  sized or allocated from src */
+               if ((dst = GDKmalloc(strlen(str_nil) + 1)) == NULL)
+                       throw(MAL, "str.reverse", MAL_MALLOC_FAIL);
+               strcpy(dst, str_nil);
+               /* hand the copy to the caller; returning without this
+                  leaves *ret unset and loses the only pointer to dst */
+               *ret = dst;
+               return MAL_SUCCEED;
+       }
+       len = strlen(src);
+       /* dst has len + 1 bytes: len for the characters, one for the
+          terminator at dst[len] */
+       if ((dst = GDKmalloc(len + 1)) == NULL)
+               throw(MAL, "str.reverse", MAL_MALLOC_FAIL);
+       dst[len] = 0;
+       /* All strings in MonetDB are encoded using UTF-8; we must
+        * make sure that the reversed string is also encoded in valid
+        * UTF-8, so we treat multibyte characters as single units.
+        *
+        * From here on len counts the bytes at the front of dst that
+        * are still unfilled: src is read front to back and every
+        * character is stored at the end of that unfilled part.
+        *
+        * NOTE(review): malformed input (a truncated sequence or a
+        * stray continuation byte) is only caught by the asserts, so
+        * nothing guards the indexing when they are compiled out. */
+       while (*src) {
+               if ((*src & 0xF8) == 0xF0) {
+                       /* 4 byte UTF-8 sequence */
+                       assert(len >= 4);
+                       dst[len - 4] = *src++;
+                       assert((*src & 0xC0) == 0x80);
+                       dst[len - 3] = *src++;
+                       assert((*src & 0xC0) == 0x80);
+                       dst[len - 2] = *src++;
+                       assert((*src & 0xC0) == 0x80);
+                       dst[len - 1] = *src++;
+                       len -= 4;
+               } else if ((*src & 0xF0) == 0xE0) {
+                       /* 3 byte UTF-8 sequence */
+                       assert(len >= 3);
+                       dst[len - 3] = *src++;
+                       assert((*src & 0xC0) == 0x80);
+                       dst[len - 2] = *src++;
+                       assert((*src & 0xC0) == 0x80);
+                       dst[len - 1] = *src++;
+                       len -= 3;
+               } else if ((*src & 0xE0) == 0xC0) {
+                       /* 2 byte UTF-8 sequence */
+                       assert(len >= 2);
+                       dst[len - 2] = *src++;
+                       assert((*src & 0xC0) == 0x80);
+                       dst[len - 1] = *src++;
+                       len -= 2;
+               } else {
+                       /* 1 byte UTF-8 "sequence" */
+                       assert(len >= 1);
+                       assert((*src & 0x80) == 0);
+                       dst[--len] = *src++;
+               }
+       }
+       /* every byte of src has been placed exactly once */
+       assert(len == 0);
+       *ret = dst;
+       return MAL_SUCCEED;
+}
+
 #include "mel.h"
 mel_func str_init_funcs[] = {
  command("str", "str", STRtostr, false, "Noop routine.", args(1,2, 
arg("",str),arg("s",str))),
@@ -4828,7 +4888,8 @@ mel_func str_init_funcs[] = {
  command("str", "repeat", STRrepeat, false, "", args(1,3, 
arg("",str),arg("s2",str),arg("c",int))),
  command("str", "space", STRspace, false, "", args(1,2, 
arg("",str),arg("l",int))),
  command("str", "epilogue", STRepilogue, false, "", args(1,1, arg("",void))),
- command("str", "asciify", STRasciify, false, "Transform in str from UTF8 to 
ASCII", args(1, 2, arg("out",str), arg("in",str))),
+ command("str", "asciify", STRasciify, false, "Transform string from UTF8 to 
ASCII", args(1, 2, arg("out",str), arg("in",str))),
+ command("str", "reverse", STRreverse, false, "Reverse a string", args(1,2, 
arg("out",str),arg("in",str))),
  { .imp=NULL }
 };
 #include "mal_import.h"
diff --git a/monetdb5/modules/kernel/batstr.c b/monetdb5/modules/kernel/batstr.c
--- a/monetdb5/modules/kernel/batstr.c
+++ b/monetdb5/modules/kernel/batstr.c
@@ -4934,7 +4934,7 @@ BATSTRasciify(bat *ret, bat *bid)
                throw(MAL, "batstr.asciify", GDK_EXCEPTION);
        }
        bi = bat_iterator(b);
-       for (start = 0, end = BATcount(b); start < end; start++) {
+       BATloop(b, start, end) {
                in = (str) BUNtail(bi, start);
                if (strNil(in)) {
                        if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
@@ -4991,6 +4991,116 @@ BATSTRasciify(bat *ret, bat *bid)
 #endif
 }
 
+/* Reverse every string of the string BAT identified by *arg.
+ *
+ * A new TRANSIENT string BAT with the same hseqbase as the input is
+ * filled with the reversed strings (nil stays nil); the bytes of each
+ * multibyte UTF-8 character are kept together.  On success *ret is set
+ * to the batCacheid of the new BAT and MAL_SUCCEED is returned.
+ *
+ * The function leaves through throw() with
+ *   RUNTIME_OBJECT_MISSING  if BATdescriptor() returns NULL,
+ *   MAL_MALLOC_FAIL         if the scratch buffer or the result BAT
+ *                           cannot be allocated,
+ *   GDK_EXCEPTION           if BUNappend() fails.
+ * On all of these paths *ret is left unwritten. */
+static str
+BATSTRreverse(bat *ret, const bat *arg)
+{
+       BAT *b, *bn;
+       BATiter bi;
+       BUN start, end;
+       const char *src;
+       /* message for the exception thrown at 'bail' */
+       const char *errmsg;
+       /* Allocate temporary space for reversed strings;
+          we grow this if we need more. */
+       size_t len, dst_len = 1024;
+       str dst, tmp;
+
+       /* zero-filled, so dst starts out as the empty string */
+       if ((dst = GDKzalloc(dst_len)) == NULL)
+               throw(MAL, "batstr.reverse", MAL_MALLOC_FAIL);
+       if ((b = BATdescriptor(*arg)) == NULL) {
+               GDKfree(dst);
+               throw(MAL, "batstr.reverse", RUNTIME_OBJECT_MISSING);
+       }
+       /* We should only get called for string BATs */
+       assert(b->ttype == TYPE_str);
+       /* Allocate result BAT */
+       bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT);
+       if (bn == NULL) {
+               BBPunfix(b->batCacheid);
+               GDKfree(dst);
+               throw(MAL, "batstr.reverse", MAL_MALLOC_FAIL);
+       }
+       /* Loop through BAT b; 'start' is index of the entry we're working
+          on, 'end' is used internally by BATloop to do the iterating */
+       bi = bat_iterator(b);
+       BATloop(b, start, end) {
+               src = (const char *) BUNtail(bi, start);
+               if (strNil(src)) {
+                       /* dst_len starts at 1024 and only ever grows,
+                          so the nil representation always fits */
+                       assert(dst_len > strlen(str_nil));
+                       strcpy(dst, str_nil);
+               } else {
+                       len = strlen(src);
+                       /* make sure dst is large enough */
+                       if (len >= dst_len) {
+                               /* Keep dst until the new block is known
+                                  to exist, so 'bail' can still free it.
+                                  NOTE(review): assumes GDKrealloc, like
+                                  realloc, leaves the old block valid
+                                  when it returns NULL -- confirm. */
+                               tmp = GDKrealloc(dst, len + 1024);
+                               if (tmp == NULL) {
+                                       errmsg = MAL_MALLOC_FAIL;
+                                       goto bail;
+                               }
+                               dst = tmp;
+                               dst_len = len + 1024;
+                       }
+                       /* dst is reused for every row and the loop below
+                          writes exactly len bytes, so terminate it here;
+                          otherwise the tail of an earlier, longer
+                          string (or uninitialised bytes added by the
+                          realloc) would follow the result */
+                       dst[len] = 0;
+                       /* All strings in MonetDB are encoded using
+                        * UTF-8; we must make sure that the reversed
+                        * string is also encoded in valid UTF-8, so we
+                        * treat multibyte characters as single units.
+                        * len now counts the still unfilled bytes at
+                        * the front of dst.
+                        * NOTE(review): malformed input is only caught
+                        * by the asserts. */
+                       while (*src) {
+                               if ((*src & 0xF8) == 0xF0) {
+                                       /* 4 byte UTF-8 sequence */
+                                       assert(len >= 4);
+                                       dst[len - 4] = *src++;
+                                       assert((*src & 0xC0) == 0x80);
+                                       dst[len - 3] = *src++;
+                                       assert((*src & 0xC0) == 0x80);
+                                       dst[len - 2] = *src++;
+                                       assert((*src & 0xC0) == 0x80);
+                                       dst[len - 1] = *src++;
+                                       len -= 4;
+                               } else if ((*src & 0xF0) == 0xE0) {
+                                       /* 3 byte UTF-8 sequence */
+                                       assert(len >= 3);
+                                       dst[len - 3] = *src++;
+                                       assert((*src & 0xC0) == 0x80);
+                                       dst[len - 2] = *src++;
+                                       assert((*src & 0xC0) == 0x80);
+                                       dst[len - 1] = *src++;
+                                       len -= 3;
+                               } else if ((*src & 0xE0) == 0xC0) {
+                                       /* 2 byte UTF-8 sequence */
+                                       assert(len >= 2);
+                                       dst[len - 2] = *src++;
+                                       assert((*src & 0xC0) == 0x80);
+                                       dst[len - 1] = *src++;
+                                       len -= 2;
+                               } else {
+                                       /* 1 byte UTF-8 "sequence" */
+                                       assert(len >= 1);
+                                       assert((*src & 0x80) == 0);
+                                       dst[--len] = *src++;
+                               }
+                       }
+                       assert(len == 0);
+               }
+               if (BUNappend(bn, dst, false) != GDK_SUCCEED) {
+                       /* BUNappend can fail since it may have to grow memory
+                          areas, especially true for string BATs */
+                       errmsg = GDK_EXCEPTION;
+                       goto bail;
+               }
+       }
+       bat_iterator_end(&bi);
+       GDKfree(dst);
+       BBPunfix(b->batCacheid);
+       *ret = bn->batCacheid;
+       BBPkeepref(bn);
+       return MAL_SUCCEED;
+ bail:
+       /* We only get here in the case of an allocation error;
+          clean up the mess we've created and throw an exception */
+       bat_iterator_end(&bi);
+       GDKfree(dst);
+       BBPunfix(b->batCacheid);
+       BBPunfix(bn->batCacheid);
+       throw(MAL, "batstr.reverse", "%s", errmsg);
+}
+
 #include "mel.h"
 mel_func batstr_init_funcs[] = {
  pattern("batstr", "length", STRbatLength, false, "Return the length of a 
string.", args(1,2, batarg("",int),batarg("s",str))),
@@ -5143,7 +5253,8 @@ mel_func batstr_init_funcs[] = {
  pattern("batstr", "repeat", STRbatrepeat_strcst, false, "", args(1,4, 
batarg("",str),arg("s",str),batarg("c",int),batarg("s",oid))),
  pattern("batstr", "space", STRbatSpace, false, "", args(1,2, 
batarg("",str),batarg("l",int))),
  pattern("batstr", "space", STRbatSpace, false, "", args(1,3, 
batarg("",str),batarg("l",int),batarg("s",oid))),
- command("batstr", "asciify", BATSTRasciify, false, "Transform in str from 
UTF8 to ASCII", args(1, 2, batarg("out",str), batarg("in",str))),
+ command("batstr", "asciify", BATSTRasciify, false, "Transform BAT of strings 
from UTF8 to ASCII", args(1, 2, batarg("",str), batarg("b",str))),
+ command("batstr", "reverse", BATSTRreverse, false, "Reverse a BAT of 
strings", args(1, 2, batarg("",str), batarg("b",str))),
  { .imp=NULL }
 };
 #include "mal_import.h"
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to