Re: [PATCH] Enable utf8->string to take a range

Vijay Marupudi Fri, 21 Jan 2022 12:21:25 -0800

> There seems to be an inconsistency here.  Can (c_start >= c_len) be
> relaxed to c_start > c_len?


Done. `substring' was a useful reference.
 
> It would be nice to document if it's an open, closed or half-
> open/closed range.  E.g. see the documentation of 'substring':

Done.

> It seems a bit weird to support [start] and [end] for utf8->string but
> not for utf16->string and utf32->string.

I will gladly implement those too. I wanted feedback on the code before
I implemented all of them, which you provided, thank you. I still want
thoughts on the name of the new C function. If that is okay, then I can
implement the rest following the same name pattern.

>> +      c_start = scm_to_size_t (start);
>
> This seems suboptimal because if start > SIZE_MAX,
> then this will throw an 'out-of-range' exception without attributing
> it to 'utf8->string' (untested).

Switched to scm_to_unsigned_integer, which performs bounds checks. This
is what `substring' does.

The updated patch is attached.

~ Vijay

>From 61b4b444eec1a8825d54604cbcb5a68bcfa9cef5 Mon Sep 17 00:00:00 2001
From: Vijay Marupudi <[email protected]>
Date: Thu, 20 Jan 2022 22:19:25 -0500
Subject: [PATCH] Enable utf8->string to take a range

Additionally, add a scm_utf8_to_string_range function for access from
C. The range arguments behave like those of `substring'.

* doc/ref/api-data.texi: Update documentation to reflect the new
  function and range constraints.
* libguile/bytevectors.c: Add new function.
* libguile/bytevectors.h: Add new function declaration.
* test-suite/tests/bytevectors.test: Add tests for exceptions and
  behavior for edge cases.
---
 doc/ref/api-data.texi             |  8 +++-
 libguile/bytevectors.c            | 66 ++++++++++++++++++++++++++-----
 libguile/bytevectors.h            |  1 +
 test-suite/tests/bytevectors.test | 27 +++++++++++++
 4 files changed, 91 insertions(+), 11 deletions(-)

diff --git a/doc/ref/api-data.texi b/doc/ref/api-data.texi
index b6c2c4d61..0206435d3 100644
--- a/doc/ref/api-data.texi
+++ b/doc/ref/api-data.texi
@@ -7139,16 +7139,22 @@ UTF-32 (aka. UCS-4) encoding of @var{str}.  For UTF-16 and UTF-32,
 it defaults to big endian.
 @end deffn
 
-@deffn {Scheme Procedure} utf8->string utf
+@deffn {Scheme Procedure} utf8->string utf [start [end]]
 @deffnx {Scheme Procedure} utf16->string utf [endianness]
 @deffnx {Scheme Procedure} utf32->string utf [endianness]
 @deffnx {C Function} scm_utf8_to_string (utf)
+@deffnx {C Function} scm_utf8_to_string_range (utf, start, end)
 @deffnx {C Function} scm_utf16_to_string (utf, endianness)
 @deffnx {C Function} scm_utf32_to_string (utf, endianness)
 Return a newly allocated string that contains from the UTF-8-, UTF-16-,
 or UTF-32-decoded contents of bytevector @var{utf}.  For UTF-16 and UTF-32,
 @var{endianness} should be the symbol @code{big} or @code{little}; when omitted,
 it defaults to big endian.
+
+@var{start} and @var{end}, when provided, must be exact integers that
+delimit the bytes to decode, @var{start} inclusive to @var{end} exclusive:
+
+0 <= @var{start} <= @var{end} <= @code{(bytevector-length @var{utf})}.
 @end deffn
 
 @node Bytevectors as Arrays
diff --git a/libguile/bytevectors.c b/libguile/bytevectors.c
index f42fbb427..3e128e667 100644
--- a/libguile/bytevectors.c
+++ b/libguile/bytevectors.c
@@ -2094,27 +2094,73 @@ SCM_DEFINE (scm_string_to_utf32, "string->utf32",
   return (str);
 
 
-SCM_DEFINE (scm_utf8_to_string, "utf8->string",
-	    1, 0, 0,
-	    (SCM utf),
-	    "Return a newly allocate string that contains from the UTF-8-"
-	    "encoded contents of bytevector @var{utf}.")
-#define FUNC_NAME s_scm_utf8_to_string
+static inline void
+validate_bytevector_range(const char* function_name, size_t len, size_t start, size_t end) {
+  if (SCM_UNLIKELY (start > len))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(start));
+    }
+  if (SCM_UNLIKELY (end > len))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(end));
+    }
+  if (SCM_UNLIKELY(end < start))
+    {
+      scm_out_of_range (function_name, scm_from_size_t(end));
+    }
+}
+
+
+SCM_DEFINE (scm_utf8_to_string_range, "utf8->string",
+            1, 2, 0,
+            (SCM utf, SCM start, SCM end),
+            "Return a newly allocate string that contains from the UTF-8-"
+            "encoded contents of bytevector @var{utf}.")
+#define FUNC_NAME s_scm_utf8_to_string_range
 {
   SCM str;
   const char *c_utf;
-  size_t c_utf_len = 0;
+  size_t c_start;
+  size_t c_end;
+  size_t c_len;
 
   SCM_VALIDATE_BYTEVECTOR (1, utf);
-
-  c_utf_len = SCM_BYTEVECTOR_LENGTH (utf);
   c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf);
-  str = scm_from_utf8_stringn (c_utf, c_utf_len);
+  c_len = SCM_BYTEVECTOR_LENGTH(utf);
 
+  if (!scm_is_eq (start, SCM_UNDEFINED))
+    {
+      c_start = scm_to_unsigned_integer (start, 0, c_len);
+    }
+  else
+    {
+      c_start = 0;
+    }
+
+  if (!scm_is_eq (end, SCM_UNDEFINED))
+    {
+      c_end = scm_to_unsigned_integer (end, 0, c_len);
+    }
+  else
+    {
+      c_end = c_len;
+    }
+
+  validate_bytevector_range(FUNC_NAME, c_len, c_start, c_end);
+  str = scm_from_utf8_stringn (c_utf + c_start, c_end - c_start);
   return (str);
 }
 #undef FUNC_NAME
 
+SCM
+scm_utf8_to_string(SCM utf)
+#define FUNC_NAME s_scm_utf8_to_string
+{
+  return scm_utf8_to_string_range(utf, SCM_UNDEFINED, SCM_UNDEFINED);
+}
+#undef FUNC_NAME
+
+
 SCM_DEFINE (scm_utf16_to_string, "utf16->string",
 	    1, 1, 0,
 	    (SCM utf, SCM endianness),
diff --git a/libguile/bytevectors.h b/libguile/bytevectors.h
index 980d6e267..82a66ee5e 100644
--- a/libguile/bytevectors.h
+++ b/libguile/bytevectors.h
@@ -113,6 +113,7 @@ SCM_API SCM scm_string_to_utf8 (SCM);
 SCM_API SCM scm_string_to_utf16 (SCM, SCM);
 SCM_API SCM scm_string_to_utf32 (SCM, SCM);
 SCM_API SCM scm_utf8_to_string (SCM);
+SCM_API SCM scm_utf8_to_string_range (SCM, SCM, SCM);
 SCM_API SCM scm_utf16_to_string (SCM, SCM);
 SCM_API SCM scm_utf32_to_string (SCM, SCM);
 
diff --git a/test-suite/tests/bytevectors.test b/test-suite/tests/bytevectors.test
index 732aadb3e..08719703a 100644
--- a/test-suite/tests/bytevectors.test
+++ b/test-suite/tests/bytevectors.test
@@ -558,6 +558,33 @@
       exception:decoding-error
     (utf8->string #vu8(104 105 239 191 50)))
 
+  (pass-if "utf8->string range: start provided"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 4)))
+      (string=? str "guile")))
+
+  (pass-if "utf8->string range: start and end provided"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 4 7)))
+      (string=? str "gui")))
+
+  (pass-if "utf8->string range: start = end = 0"
+    (let* ((utf8 (string->utf8 "gnu guile"))
+           (str (utf8->string utf8 0 0)))
+      (string=? str "")))
+
+  (pass-if-exception "utf8->string range: start > len"
+      exception:out-of-range
+    (let* ((utf8 (string->utf8 "four")))
+      ;; 4 as start is expected to return an empty string, in congruence
+      ;; with `substring'.
+      (utf8->string utf8 5)))
+
+  (pass-if-exception "utf8->string range: end < start"
+      exception:out-of-range
+      (let* ((utf8 (string->utf8 "gnu guile")))
+        (utf8->string utf8 1 0)))
+
   (pass-if "utf16->string"
     (let* ((utf16  (uint-list->bytevector (map char->integer
                                                (string->list "hello, world"))
-- 
2.34.1

Re: [PATCH] Enable utf8->string to take a range

Reply via email to