Changeset: 54df68af9352 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/54df68af9352 Modified Files: monetdb5/modules/atoms/str.c monetdb5/modules/kernel/batstr.c Branch: txtsim Log Message:
STRreverse and BATSTRreverse implementation. diffs (219 lines): diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c --- a/monetdb5/modules/atoms/str.c +++ b/monetdb5/modules/atoms/str.c @@ -4785,6 +4785,66 @@ STRasciify(str *r, const str *s) #endif } +static str +STRreverse(str *ret, const str *arg) +{ + str src = *arg; + size_t len = strlen(src); + str dst = GDKmalloc(len + 1); + /* dst is a buffer of length larger than len (i.e. dst[len] exists), + src is a UTF-8-encoded string of length exactly len bytes. */ + if (dst == NULL) + throw(MAL, "str.reverse", MAL_MALLOC_FAIL); + dst[len] = 0; + if (strNil(src)) { + /* special case for nil:str */ + assert(len == strlen(str_nil)); + strcpy(dst, str_nil); + return MAL_SUCCEED; + } + /* All strings in MonetDB are encoded using UTF-8; we must + * make sure that the reversed string is also encoded in valid + * UTF-8, so we treat multibyte characters as single units */ + while (*src) { + if ((*src & 0xF8) == 0xF0) { + /* 4 byte UTF-8 sequence */ + assert(len >= 4); + dst[len - 4] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 3] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 4; + } else if ((*src & 0xF0) == 0xE0) { + /* 3 byte UTF-8 sequence */ + assert(len >= 3); + dst[len - 3] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 3; + } else if ((*src & 0xE0) == 0xC0) { + /* 2 byte UTF-8 sequence */ + assert(len >= 2); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 2; + } else { + /* 1 byte UTF-8 "sequence" */ + assert(len >= 1); + assert((*src & 0x80) == 0); + dst[--len] = *src++; + } + } + assert(len == 0); + *ret = dst; + return MAL_SUCCEED; +} + #include "mel.h" mel_func str_init_funcs[] = { command("str", "str", STRtostr, false, "Noop routine.", args(1,2, arg("",str),arg("s",str))), @@ -4828,7 +4888,8 @@ mel_func str_init_funcs[] = { command("str", "repeat", STRrepeat, false, "", args(1,3, arg("",str),arg("s2",str),arg("c",int))), command("str", "space", STRspace, false, "", args(1,2, arg("",str),arg("l",int))), command("str", "epilogue", STRepilogue, false, "", args(1,1, arg("",void))), - command("str", "asciify", STRasciify, false, "Transform in str from UTF8 to ASCII", args(1, 2, arg("out",str), arg("in",str))), + command("str", "asciify", STRasciify, false, "Transform string from UTF8 to ASCII", args(1, 2, arg("out",str), arg("in",str))), + command("str", "reverse", STRreverse, false, "Reverse a string", args(1,2, arg("out",str),arg("in",str))), { .imp=NULL } }; #include "mal_import.h" diff --git a/monetdb5/modules/kernel/batstr.c b/monetdb5/modules/kernel/batstr.c --- a/monetdb5/modules/kernel/batstr.c +++ b/monetdb5/modules/kernel/batstr.c @@ -4934,7 +4934,7 @@ BATSTRasciify(bat *ret, bat *bid) throw(MAL, "batstr.asciify", GDK_EXCEPTION); } bi = bat_iterator(b); - for (start = 0, end = BATcount(b); start < end; start++) { + BATloop(b, start, end) { in = (str) BUNtail(bi, start); if (strNil(in)) { if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) { @@ -4991,6 +4991,116 @@ BATSTRasciify(bat *ret, bat *bid) #endif } +static str +BATSTRreverse(bat *ret, const bat *arg) +{ + BAT *b, *bn; + BATiter bi; + BUN start, end; + const char *src; + /* Allocate temporary space for reversed strings; + we grow this if we need more. */ + size_t len, dst_len = 1024; + int i = -1; + str dst, error[2] = { GDK_EXCEPTION, MAL_MALLOC_FAIL }; + /* Use zalloc to force valid UTF-8 */ + if ((dst = GDKzalloc(dst_len)) == NULL) + throw(MAL, "batstr.reverse", MAL_MALLOC_FAIL); + if ((b = BATdescriptor(*arg)) == NULL) { + GDKfree(dst); + throw(MAL, "batstr.reverse", RUNTIME_OBJECT_MISSING); + } + /* We should only get called for string BATs */ + assert(b->ttype == TYPE_str); + /* Allocate result BAT */ + bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT); + if(bn == NULL) { + BBPunfix(b->batCacheid); + GDKfree(dst); + throw(MAL, "batstr.reverse", MAL_MALLOC_FAIL); + } + /* Loop through BAT b; 'start' is index of the entry we're working + on, 'end' is used internally by BATloop to do the iterating */ + bi = bat_iterator(b); + BATloop(b, start, end) { + src = (const char *) BUNtail(bi, start); + if (strNil(src)) { + assert(len > strlen(src)); + strcpy(dst, str_nil); + } + else { + len = strlen(src); + /* make sure dst is large enough */ + if (len >= dst_len) { + dst_len = len + 1024; + if ((dst = GDKrealloc(dst, dst_len)) == NULL) { + i = 1; + goto bail; + } + } + /* All strings in MonetDB are encoded using UTF-8; we must + * make sure that the reversed string is also encoded in valid + * UTF-8, so we treat multibyte characters as single units */ + while (*src) { + if ((*src & 0xF8) == 0xF0) { + /* 4 byte UTF-8 sequence */ + assert(len >= 4); + dst[len - 4] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 3] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 4; + } else if ((*src & 0xF0) == 0xE0) { + /* 3 byte UTF-8 sequence */ + assert(len >= 3); + dst[len - 3] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 3; + } else if ((*src & 0xE0) == 0xC0) { + /* 2 byte UTF-8 sequence */ + assert(len >= 2); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 2; + } else { + /* 1 byte UTF-8 "sequence" */ + assert(len >= 1); + assert((*src & 0x80) == 0); + dst[--len] = *src++; + } + } + assert(len == 0); + } + if (BUNappend(bn, dst, false) != GDK_SUCCEED) { + /* BUNappend can fail since it may have to grow memory + areas, especially true for string BATs */ + i = 0; + goto bail; + } + } + bat_iterator_end(&bi); + GDKfree(dst); + BBPunfix(b->batCacheid); + *ret = bn->batCacheid; + BBPkeepref(bn); + return MAL_SUCCEED; + bail: + /* We only get here in the case of an allocation error; + clean up the mess we've created and throw an exception */ + bat_iterator_end(&bi); + GDKfree(dst); + BBPunfix(b->batCacheid); + BBPunfix(bn->batCacheid); + throw(MAL, "batstr.reverse", "%s", error[i]); +} + #include "mel.h" mel_func batstr_init_funcs[] = { pattern("batstr", "length", STRbatLength, false, "Return the length of a string.", args(1,2, batarg("",int),batarg("s",str))), @@ -5143,7 +5253,8 @@ mel_func batstr_init_funcs[] = { pattern("batstr", "repeat", STRbatrepeat_strcst, false, "", args(1,4, batarg("",str),arg("s",str),batarg("c",int),batarg("s",oid))), pattern("batstr", "space", STRbatSpace, false, "", args(1,2, batarg("",str),batarg("l",int))), pattern("batstr", "space", STRbatSpace, false, "", args(1,3, batarg("",str),batarg("l",int),batarg("s",oid))), - command("batstr", "asciify", BATSTRasciify, false, "Transform in str from UTF8 to ASCII", args(1, 2, batarg("out",str), batarg("in",str))), + command("batstr", "asciify", BATSTRasciify, false, "Transform BAT of strings from UTF8 to ASCII", args(1, 2, batarg("",str), batarg("b",str))), + command("batstr", "reverse", BATSTRreverse, false, "Reverse a BAT of strings", args(1, 2, batarg("",str), batarg("b",str))), { .imp=NULL } }; #include "mal_import.h" _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org