> There seems to be an inconsistency here. Can (c_start >= c_len) be relaxed to c_start > c_len?
Done. `substring' was a useful reference.

> It would be nice to document if it's an open, closed or half-open/closed
> range. E.g. see the documentation of 'substring':

Done.

> It seems a bit weird to support [start] and [end] for utf8->string but
> not for utf16->string and utf32->string.

I will gladly implement those too. I wanted feedback on the code before I implemented all of them, which you provided, thank you. I still want thoughts on the name of the new C function. If that is okay, then I can implement the rest following the same name pattern.

>> + c_start = scm_to_size_t (start);
>
> This seems suboptimal because if start > SIZE_MAX, then this will throw
> an 'out-of-range' exception without attributing it to 'utf8->string'
> (untested).

Switched to scm_to_unsigned_integer, which does bounds checks. This is what `substring' does.

The updated patch is attached.

~ Vijay
>From 61b4b444eec1a8825d54604cbcb5a68bcfa9cef5 Mon Sep 17 00:00:00 2001 From: Vijay Marupudi <vi...@vijaymarupudi.com> Date: Thu, 20 Jan 2022 22:19:25 -0500 Subject: [PATCH] Enable utf8->string to take a range Additionally, adds a scm_utf8_to_string_range function for access from C. Behaves like substring. * doc/ref/api-data.texi: Updated documentation to reflect new function and range constraints * libguile/bytevectors.c: Added new function. * libguile/bytevectors.h: Added new function declaration. * test-suite/tests/bytevectors.test: Added tests for exceptions and behavior for edge cases --- doc/ref/api-data.texi | 8 +++- libguile/bytevectors.c | 66 ++++++++++++++++++++++++++----- libguile/bytevectors.h | 1 + test-suite/tests/bytevectors.test | 27 +++++++++++++ 4 files changed, 91 insertions(+), 11 deletions(-) diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi index b6c2c4d61..0206435d3 100644 --- a/doc/ref/api-data.texi +++ b/doc/ref/api-data.texi @@ -7139,16 +7139,22 @@ UTF-32 (aka. UCS-4) encoding of @var{str}. For UTF-16 and UTF-32, it defaults to big endian. @end deffn -@deffn {Scheme Procedure} utf8->string utf +@deffn {Scheme Procedure} utf8->string utf [start [end]] @deffnx {Scheme Procedure} utf16->string utf [endianness] @deffnx {Scheme Procedure} utf32->string utf [endianness] @deffnx {C Function} scm_utf8_to_string (utf) +@deffnx {C Function} scm_utf8_to_string_range (utf, start, end) @deffnx {C Function} scm_utf16_to_string (utf, endianness) @deffnx {C Function} scm_utf32_to_string (utf, endianness) Return a newly allocated string that contains from the UTF-8-, UTF-16-, or UTF-32-decoded contents of bytevector @var{utf}. For UTF-16 and UTF-32, @var{endianness} should be the symbol @code{big} or @code{little}; when omitted, it defaults to big endian. + +@var{start} and @var{end}, when provided, must be exact integers +satisfying: + +0 <= @var{start} <= @var{end} <= @code{(bytevector-length @var{utf})}. 
@end deffn @node Bytevectors as Arrays diff --git a/libguile/bytevectors.c b/libguile/bytevectors.c index f42fbb427..3e128e667 100644 --- a/libguile/bytevectors.c +++ b/libguile/bytevectors.c @@ -2094,27 +2094,73 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32", return (str); -SCM_DEFINE (scm_utf8_to_string, "utf8->string", - 1, 0, 0, - (SCM utf), - "Return a newly allocate string that contains from the UTF-8-" - "encoded contents of bytevector @var{utf}.") -#define FUNC_NAME s_scm_utf8_to_string +static inline void +validate_bytevector_range(const char* function_name, size_t len, size_t start, size_t end) { + if (SCM_UNLIKELY (start > len)) + { + scm_out_of_range (function_name, scm_from_size_t(start)); + } + if (SCM_UNLIKELY (end > len)) + { + scm_out_of_range (function_name, scm_from_size_t(end)); + } + if (SCM_UNLIKELY(end < start)) + { + scm_out_of_range (function_name, scm_from_size_t(end)); + } +} + + +SCM_DEFINE (scm_utf8_to_string_range, "utf8->string", + 1, 2, 0, + (SCM utf, SCM start, SCM end), + "Return a newly allocate string that contains from the UTF-8-" + "encoded contents of bytevector @var{utf}.") +#define FUNC_NAME s_scm_utf8_to_string_range { SCM str; const char *c_utf; - size_t c_utf_len = 0; + size_t c_start; + size_t c_end; + size_t c_len; SCM_VALIDATE_BYTEVECTOR (1, utf); - - c_utf_len = SCM_BYTEVECTOR_LENGTH (utf); c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf); - str = scm_from_utf8_stringn (c_utf, c_utf_len); + c_len = SCM_BYTEVECTOR_LENGTH(utf); + if (!scm_is_eq (start, SCM_UNDEFINED)) + { + c_start = scm_to_unsigned_integer (start, 0, c_len); + } + else + { + c_start = 0; + } + + if (!scm_is_eq (end, SCM_UNDEFINED)) + { + c_end = scm_to_unsigned_integer (end, 0, c_len); + } + else + { + c_end = c_len; + } + + validate_bytevector_range(FUNC_NAME, c_len, c_start, c_end); + str = scm_from_utf8_stringn (c_utf + c_start, c_end - c_start); return (str); } #undef FUNC_NAME +SCM +scm_utf8_to_string(SCM utf) +#define FUNC_NAME 
s_scm_utf8_to_string +{ + return scm_utf8_to_string_range(utf, SCM_UNDEFINED, SCM_UNDEFINED); +} +#undef FUNC_NAME + + SCM_DEFINE (scm_utf16_to_string, "utf16->string", 1, 1, 0, (SCM utf, SCM endianness), diff --git a/libguile/bytevectors.h b/libguile/bytevectors.h index 980d6e267..82a66ee5e 100644 --- a/libguile/bytevectors.h +++ b/libguile/bytevectors.h @@ -113,6 +113,7 @@ SCM_API SCM scm_string_to_utf8 (SCM); SCM_API SCM scm_string_to_utf16 (SCM, SCM); SCM_API SCM scm_string_to_utf32 (SCM, SCM); SCM_API SCM scm_utf8_to_string (SCM); +SCM_API SCM scm_utf8_to_string_range (SCM, SCM, SCM); SCM_API SCM scm_utf16_to_string (SCM, SCM); SCM_API SCM scm_utf32_to_string (SCM, SCM); diff --git a/test-suite/tests/bytevectors.test b/test-suite/tests/bytevectors.test index 732aadb3e..08719703a 100644 --- a/test-suite/tests/bytevectors.test +++ b/test-suite/tests/bytevectors.test @@ -558,6 +558,33 @@ exception:decoding-error (utf8->string #vu8(104 105 239 191 50))) + (pass-if "utf8->string range: start provided" + (let* ((utf8 (string->utf8 "gnu guile")) + (str (utf8->string utf8 4))) + (string=? str "guile"))) + + (pass-if "utf8->string range: start and end provided" + (let* ((utf8 (string->utf8 "gnu guile")) + (str (utf8->string utf8 4 7))) + (string=? str "gui"))) + + (pass-if "utf8->string range: start = end = 0" + (let* ((utf8 (string->utf8 "gnu guile")) + (str (utf8->string utf8 0 0))) + (string=? str ""))) + + (pass-if-exception "utf8->string range: start > len" + exception:out-of-range + (let* ((utf8 (string->utf8 "four"))) + ;; 4 as start is expected to return an empty string, in congruence + ;; with `substring'. + (utf8->string utf8 5))) + + (pass-if-exception "utf8->string range: end < start" + exception:out-of-range + (let* ((utf8 (string->utf8 "gnu guile"))) + (utf8->string utf8 1 0))) + (pass-if "utf16->string" (let* ((utf16 (uint-list->bytevector (map char->integer (string->list "hello, world")) -- 2.34.1