Dear Guile maintainers,
I want to report various problems with locale-dependent number
parsing in Guile version 3.0.8 (other versions, e. g. 2.2.7, have these
issues, too). Furthermore I want to propose a patch resolving these problems.
First consider the problems with function
‘locale-string->integer’:
------------------------------------------------------------
(use-modules (ice-9 i18n))
(substring "12" 0 1)
⇒ "1"
(locale-string->integer "1"
10
(make-locale LC_ALL "C"))
⇒ 1
⇒ 1
(locale-string->integer (substring "12" 0 1)
10
(make-locale LC_ALL "C"))
⇒ 12 ; expected 1
⇒ 2 ; expected 1
------------------------------------------------------------
This problem is caused by the erroneous handling of substrings
in function ‘locale-string->integer’.
Moreover ‘locale-string->integer’ throws an exception of
"Invalid read access of chars of wide string" when called with
a wide string as its first argument.
An especially weird example is:
------------------------------------------------------------
(use-modules (ice-9 i18n))
(substring "1\u0100" 0 1)
⇒ "1"
(locale-string->integer "1" 10 (make-locale LC_ALL "C"))
⇒ 1
⇒ 1
(locale-string->integer (substring "1\u0100" 0 1)
10
(make-locale LC_ALL "C"))
⊣ ice-9/boot-9.scm:1685:16: In procedure raise-exception:
Invalid read access of chars of wide string: "1"
; expected values 1 and 1
------------------------------------------------------------
The function ‘locale-string->inexact’ has similar problems:
------------------------------------------------------------
(use-modules (ice-9 i18n))
(substring "0.5625" 0 3)
⇒ "0.5"
(locale-string->inexact "0.5"
(make-locale LC_ALL "C"))
⇒ 0.5
⇒ 3
(locale-string->inexact (substring "0.5625" 0 3)
(make-locale LC_ALL "C"))
⇒ 0.5625 ; expected 0.5
⇒ 6 ; expected 3
------------------------------------------------------------
This problem is caused by the erroneous handling of substrings
in function ‘locale-string->inexact’.
Moreover ‘locale-string->inexact’ throws an exception of
"Invalid read access of chars of wide string" when called
with a wide string as its first argument.
An especially weird example is:
------------------------------------------------------------
(use-modules (ice-9 i18n))
(substring "1.25\u0100" 0 4)
⇒ "1.25"
(locale-string->inexact "1.25" (make-locale LC_ALL "C"))
⇒ 1.25
⇒ 4
(locale-string->inexact (substring "1.25\u0100" 0 4)
(make-locale LC_ALL "C"))
⊣ ice-9/boot-9.scm:1685:16: In procedure raise-exception:
Invalid read access of chars of wide string: "1.25"
; expected values 1.25 and 4
------------------------------------------------------------
A proposal for a patch (based on Guile 3.0.8) resolving these
issues and accompanying tests is attached to this message. In
function ‘scm_locale_string_to_integer’ a check that the parameter ‘base’
(if provided) is 0 or an integer between
2 and 36 has been added, as this is required by the functions ‘strtol’ resp.
‘wcstol’.
No assumption about the relationship between the types ‘scm_t_wchar’ and
‘wchar_t’ has been made for the sake of portability. The proposal is a bit
long -- please feel free
to pick what you see fit.
Best regards,
Andreas Ettner
From: Andreas Ettner <andreas.ett...@freenet.de>
Date: Mon, 9 May 2022 10:05:26 +0200
Subject: [PATCH] Improve internationalization
---
libguile/i18n.c | 257 ++++++++++++++++++++++++++++++++-----
test-suite/tests/i18n.test | 73 +++++++++--
2 files changed, 288 insertions(+), 42 deletions(-)
diff --git a/libguile/i18n.c b/libguile/i18n.c
index 52a8080..609eb0b 100644
--- a/libguile/i18n.c
+++ b/libguile/i18n.c
@@ -26,20 +26,27 @@
#include <string.h> /* `strcoll ()' */
#include <ctype.h> /* `toupper ()' et al. */
#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
#include <unicase.h>
#include <unistr.h>
+#include <wchar.h>
#include "boolean.h"
#include "chars.h"
#include "dynwind.h"
+#include "error.h"
#include "extensions.h"
#include "feature.h"
+#include "gc.h"
#include "gsubr.h"
#include "list.h"
#include "modules.h"
#include "numbers.h"
#include "pairs.h"
#include "posix.h" /* for `scm_i_locale_mutex' */
+#include "scm.h"
#include "smob.h"
#include "strings.h"
#include "symbols.h"
@@ -1364,6 +1371,84 @@ SCM_DEFINE (scm_string_locale_titlecase,
"string-locale-titlecase", 1, 1, 0,
/* Locale-dependent number parsing. */
+/* Yield 1 if the value of the expression A is less or equal to the
+ value of expression B in the mathematical sense, otherwise yield
+ 0. The argument expressions may be evaluated multiple times.
+
+ Mind you: `1U <= -1' evaluates to 1 in C. */
+#define SCM_LEQ(a, b) \
+ (((a) < 0) ? \
+ (((b) < 0) ? \
+ ((a) <= (b)) : \
+ 1) : \
+ (((b) < 0) ? \
+ 0 : \
+ ((a) <= (b))))
+
+/* Copy SIZE elements of array FROM into array TO replacing all not
+ representable characters with null. Null characters are passed
+ through. The arrays TO and FROM must both have a length of at
+ least SIZE and must not overlap. */
+static void
+scm_t_wchar_to_wchar_t_array (wchar_t to[],
+ const scm_t_wchar from[],
+ size_t size)
+{
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ {
+#if (SCM_LEQ (WCHAR_MIN, INT32_MIN) && SCM_LEQ (INT32_MAX, WCHAR_MAX))
+ /* Optimization: Since `scm_t_wchar' is a signed, 32-bit integer
+ type (according to section [API Reference | Data Types |
+ Characters] in the Guile Reference), FROM[I] is in this
+ case clearly presentable in `wchar_t'. */
+ to[i] = (wchar_t) from[i];
+#else
+ scm_t_wchar ch = from[i];
+ to[i] = ((SCM_LEQ (WCHAR_MIN, ch) && SCM_LEQ (ch, WCHAR_MAX)) ?
+ ((wchar_t) ch) :
+ L'\0');
+#endif
+ }
+}
+
+#define SCM_NARROW_STRING_TO_C(str, c_str, c_str_malloc_p) \
+ do \
+ { \
+ size_t len, bytes; \
+ \
+ len = scm_i_string_length (str); \
+ if (!(len < SIZE_MAX)) \
+ scm_num_overflow ("SCM_NARROW_STRING_TO_C"); \
+ \
+ bytes = len + ((size_t) 1); \
+ c_str_malloc_p = (bytes > SCM_MAX_ALLOCA); \
+ c_str = (c_str_malloc_p ? scm_malloc (bytes) : alloca (bytes)); \
+ \
+ memcpy (c_str, scm_i_string_chars (str), len); \
+ c_str[len] = '\0'; \
+ } while (0)
+
+#define SCM_WIDE_STRING_TO_C(str, c_str, c_str_malloc_p) \
+ do \
+ { \
+ size_t len, bytes; \
+ \
+ len = scm_i_string_length (str); \
+ if (!(len < (SIZE_MAX / sizeof (wchar_t)))) \
+ scm_num_overflow ("SCM_WIDE_STRING_TO_C"); \
+ \
+ bytes = (len + ((size_t) 1)) * sizeof (wchar_t); \
+ c_str_malloc_p = (bytes > SCM_MAX_ALLOCA); \
+ c_str = (c_str_malloc_p ? scm_malloc (bytes) : alloca (bytes)); \
+ \
+ scm_t_wchar_to_wchar_t_array (c_str, \
+ scm_i_string_wide_chars (str), \
+ len); \
+ c_str[len] = L'\0'; \
+ } while (0)
+
SCM_DEFINE (scm_locale_string_to_integer, "locale-string->integer",
1, 2, 0, (SCM str, SCM base, SCM locale),
"Convert string @var{str} into an integer according to either "
@@ -1374,42 +1459,99 @@ SCM_DEFINE (scm_locale_string_to_integer,
"locale-string->integer",
#define FUNC_NAME s_scm_locale_string_to_integer
{
SCM result;
- long c_result;
+ SCM char_count;
int c_base;
- const char *c_str;
- char *c_endptr;
scm_t_locale c_locale;
SCM_VALIDATE_STRING (1, str);
- c_str = scm_i_string_chars (str);
if (!scm_is_eq (base, SCM_UNDEFINED))
- SCM_VALIDATE_INT_COPY (2, base, c_base);
+ {
+ SCM_VALIDATE_INT_COPY (2, base, c_base);
+ if (!(c_base == 0 || (2 <= c_base && c_base <= 36)))
+ scm_out_of_range (FUNC_NAME, base);
+ }
else
c_base = 10;
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (3, locale, c_locale);
- if (c_locale != NULL)
+ if (scm_i_is_narrow_string (str))
{
+ long c_result;
+ char *c_str;
+ int c_str_malloc_p;
+ char *c_endptr;
+
+ scm_dynwind_begin (0);
+ {
+ SCM_NARROW_STRING_TO_C (str, c_str, c_str_malloc_p);
+ if (c_str_malloc_p)
+ scm_dynwind_free (c_str);
+
+ if (c_locale != NULL)
+ {
#if defined USE_GNU_LOCALE_API && defined HAVE_STRTOL_L
- c_result = strtol_l (c_str, &c_endptr, c_base, c_locale);
+ c_result = strtol_l (c_str, &c_endptr, c_base, c_locale);
#else
- RUN_IN_LOCALE_SECTION (c_locale,
- c_result = strtol (c_str, &c_endptr, c_base));
+ RUN_IN_LOCALE_SECTION (c_locale,
+ c_result = strtol (c_str, &c_endptr,
c_base));
#endif
+ }
+ else
+ c_result = strtol (c_str, &c_endptr, c_base);
+
+ if (c_endptr == c_str)
+ {
+ result = SCM_BOOL_F;
+ char_count = scm_from_int (0);
+ }
+ else
+ {
+ result = scm_from_long (c_result);
+ char_count = scm_from_ptrdiff_t (c_endptr - c_str);
+ }
+ }
+ scm_dynwind_end ();
}
else
- c_result = strtol (c_str, &c_endptr, c_base);
+ {
+ long c_result;
+ wchar_t *c_str;
+ int c_str_malloc_p;
+ wchar_t *c_endptr;
- scm_remember_upto_here (str);
+ scm_dynwind_begin (0);
+ {
+ SCM_WIDE_STRING_TO_C (str, c_str, c_str_malloc_p);
+ if (c_str_malloc_p)
+ scm_dynwind_free (c_str);
+
+ if (c_locale != NULL)
+ {
+ RUN_IN_LOCALE_SECTION (c_locale,
+ c_result = wcstol (c_str, &c_endptr,
c_base));
+ }
+ else
+ c_result = wcstol (c_str, &c_endptr, c_base);
+
+ if (c_endptr == c_str)
+ {
+ result = SCM_BOOL_F;
+ char_count = scm_from_int (0);
+ }
+ else
+ {
+ result = scm_from_long (c_result);
+ char_count = scm_from_ptrdiff_t (c_endptr - c_str);
+ }
+ }
+ scm_dynwind_end ();
+ }
- if (c_endptr == c_str)
- result = SCM_BOOL_F;
- else
- result = scm_from_long (c_result);
+ scm_remember_upto_here_2 (str, locale);
- return scm_values_2 (result, scm_from_long (c_endptr - c_str));
+ return scm_values_2 (result, char_count);
}
#undef FUNC_NAME
@@ -1424,36 +1566,89 @@ SCM_DEFINE (scm_locale_string_to_inexact,
"locale-string->inexact",
#define FUNC_NAME s_scm_locale_string_to_inexact
{
SCM result;
- double c_result;
- const char *c_str;
- char *c_endptr;
+ SCM char_count;
scm_t_locale c_locale;
SCM_VALIDATE_STRING (1, str);
- c_str = scm_i_string_chars (str);
SCM_VALIDATE_OPTIONAL_LOCALE_COPY (2, locale, c_locale);
- if (c_locale != NULL)
+ if (scm_i_is_narrow_string (str))
{
+ double c_result;
+ char *c_str;
+ int c_str_malloc_p;
+ char *c_endptr;
+
+ scm_dynwind_begin (0);
+ {
+ SCM_NARROW_STRING_TO_C (str, c_str, c_str_malloc_p);
+ if (c_str_malloc_p)
+ scm_dynwind_free (c_str);
+
+ if (c_locale != NULL)
+ {
#if defined USE_GNU_LOCALE_API && defined HAVE_STRTOD_L
- c_result = strtod_l (c_str, &c_endptr, c_locale);
+ c_result = strtod_l (c_str, &c_endptr, c_locale);
#else
- RUN_IN_LOCALE_SECTION (c_locale,
- c_result = strtod (c_str, &c_endptr));
+ RUN_IN_LOCALE_SECTION (c_locale,
+ c_result = strtod (c_str, &c_endptr));
#endif
+ }
+ else
+ c_result = strtod (c_str, &c_endptr);
+
+ if (c_endptr == c_str)
+ {
+ result = SCM_BOOL_F;
+ char_count = scm_from_int (0);
+ }
+ else
+ {
+ result = scm_from_double (c_result);
+ char_count = scm_from_ptrdiff_t (c_endptr - c_str);
+ }
+ }
+ scm_dynwind_end ();
}
else
- c_result = strtod (c_str, &c_endptr);
+ {
+ double c_result;
+ wchar_t *c_str;
+ int c_str_malloc_p;
+ wchar_t *c_endptr;
- scm_remember_upto_here (str);
+ scm_dynwind_begin (0);
+ {
+ SCM_WIDE_STRING_TO_C (str, c_str, c_str_malloc_p);
+ if (c_str_malloc_p)
+ scm_dynwind_free (c_str);
+
+ if (c_locale != NULL)
+ {
+ RUN_IN_LOCALE_SECTION (c_locale,
+ c_result = wcstod (c_str, &c_endptr));
+ }
+ else
+ c_result = wcstod (c_str, &c_endptr);
+
+ if (c_endptr == c_str)
+ {
+ result = SCM_BOOL_F;
+ char_count = scm_from_int (0);
+ }
+ else
+ {
+ result = scm_from_double (c_result);
+ char_count = scm_from_ptrdiff_t (c_endptr - c_str);
+ }
+ }
+ scm_dynwind_end ();
+ }
- if (c_endptr == c_str)
- result = SCM_BOOL_F;
- else
- result = scm_from_double (c_result);
+ scm_remember_upto_here_2 (str, locale);
- return scm_values_2 (result, scm_from_long (c_endptr - c_str));
+ return scm_values_2 (result, char_count);
}
#undef FUNC_NAME
diff --git a/test-suite/tests/i18n.test b/test-suite/tests/i18n.test
index 83b53d0..ec295c1 100644
--- a/test-suite/tests/i18n.test
+++ b/test-suite/tests/i18n.test
@@ -412,19 +412,70 @@
(with-test-prefix "number parsing"
(pass-if "locale-string->integer"
- (call-with-values (lambda () (locale-string->integer "123"))
- (lambda (result char-count)
- (and (equal? result 123)
- (equal? char-count 3)))))
+ (and (call-with-values
+ (lambda () (locale-string->integer "123"))
+ (lambda (result char-count)
+ (and (equal? result 123)
+ (equal? char-count 3))))
+ (call-with-values
+ (lambda () (locale-string->integer (substring "12" 0 1)
+ 10
+ (make-locale LC_ALL "C")))
+ (lambda (result char-count)
+ (and (equal? result 1)
+ (equal? char-count 1))))
+ (call-with-values
+ (lambda () (locale-string->integer (substring "1\u0100" 0 1)
+ 10
+ (make-locale LC_ALL "C")))
+ (lambda (result char-count)
+ (and (equal? result 1)
+ (equal? char-count 1))))))
+
+ (pass-if "locale-string->integer (American English)"
+ (under-american-english-locale-or-unresolved
+ (lambda ()
+ (call-with-values
+ (lambda () (locale-string->integer (substring "\u20021" 0 2)
+ 10
+ %american-english-locale))
+ (lambda (result char-count)
+ (and (equal? result 1)
+ (equal? char-count 2)))))))
(pass-if "locale-string->inexact"
- (call-with-values
- (lambda ()
- (locale-string->inexact "123.456"
- (make-locale (list LC_NUMERIC) "C")))
- (lambda (result char-count)
- (and (equal? result 123.456)
- (equal? char-count 7)))))
+ (and (call-with-values
+ (lambda ()
+ (locale-string->inexact "123.456"
+ (make-locale (list LC_NUMERIC) "C")))
+ (lambda (result char-count)
+ (and (equal? result 123.456)
+ (equal? char-count 7))))
+ (call-with-values
+ (lambda ()
+ (locale-string->inexact (substring "0.5625" 0 3)
+ (make-locale LC_ALL "C")))
+ (lambda (result char-count)
+ (and (equal? result 0.5)
+ (equal? char-count 3))))
+ (call-with-values
+ (lambda ()
+ (locale-string->inexact (substring "1.25\u0100" 0 4)
+ (make-locale LC_ALL "C")))
+ (lambda (result char-count)
+ (and (equal? result 1.25)
+ (equal? char-count 4))))))
+
+ (pass-if "locale-string->inexact (American English)"
+ (under-american-english-locale-or-unresolved
+ (lambda ()
+ (call-with-values
+ (lambda ()
+ (locale-string->inexact (substring "\u20021.25" 0 5)
+ %american-english-locale))
+ (lambda (result char-count)
+ (and (equal? result 1.25)
+ (equal? char-count 5)))))))
(pass-if "locale-string->inexact (French)"
(under-french-locale-or-unresolved
--
2.35.1