From: Paul Eggert <egg...@penguin.cs.ucla.edu>

Fix mbrtowc so that it never returns -1 in the C locale,
as this conflicts with a future version of POSIX
http://austingroupbugs.net/view.php?id=663#c2738
and causes problems with GNU grep: http://bugs.gnu.org/23234
See glibc bug 19932:
https://sourceware.org/bugzilla/show_bug.cgi?id=19932
* doc/posix-functions/mbrlen.texi (mbrlen):
* doc/posix-functions/mbrtowc.texi (mbrtowc):
Document the glibc bug.
* lib/mbrtowc.c [C_LOCALE_MAYBE_EILSEQ]:
Include hard-locale.h, locale.h.
(rpl_mbrtowc): Work around the C_LOCALE_MAYBE_EILSEQ bug,
if the bug is possible.
* m4/mbrtowc.m4 (gl_MBRTOWC_C_LOCALE): New macro.
(gl_FUNC_MBRTOWC): Use it, and define C_LOCALE_MAYBE_EILSEQ as needed.
* modules/hard-locale (License): Now LGPLv2+, for mbrtowc.
* modules/mbrtowc (Depends-on): Add hard-locale.
* modules/mbrtowc-tests (Files, TESTS): Add tests/test-mbrtowc5.sh.
* tests/test-mbrtowc.c (main): Test for bug fix if arg is '5'.
* tests/test-mbrtowc5.sh: New file.
---
 ChangeLog | 24 ++++++++++++++++++
 doc/posix-functions/mbrlen.texi | 4 +++
 doc/posix-functions/mbrtowc.texi | 4 +++
 lib/mbrtowc.c | 54 ++++++++++++++++++++--------------------
 m4/mbrtowc.m4 | 50 ++++++++++++++++++++++++++++++++++++-
 modules/hard-locale | 2 +-
 modules/mbrtowc | 1 +
 modules/mbrtowc-tests | 3 ++-
 tests/test-mbrtowc.c | 11 +++++++-
 tests/test-mbrtowc5.sh | 6 +++++
 10 files changed, 128 insertions(+), 31 deletions(-)
 create mode 100755 tests/test-mbrtowc5.sh
diff --git a/ChangeLog b/ChangeLog index 980cfaa..77f1be9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +2016-04-09 Paul Eggert <egg...@penguin.cs.ucla.edu> + + mbrtowc: work around glibc bug#19932 + Fix mbrtowc so that it never returns -1 in the C locale, + as this conflicts with a future version of POSIX + http://austingroupbugs.net/view.php?id=663#c2738 + and causes problems with GNU grep: http://bugs.gnu.org/23234 + See glibc bug 19932: + https://sourceware.org/bugzilla/show_bug.cgi?id=19932 + * doc/posix-functions/mbrlen.texi (mbrlen): + * doc/posix-functions/mbrtowc.texi (mbrtowc): + Document the glibc bug. + * lib/mbrtowc.c [C_LOCALE_MAYBE_EILSEQ]: + Include hard-locale.h, locale.h. + (rpl_mbrtowc): Work around the C_LOCALE_MAYBE_EILSEQ bug, + if the bug is possible. + * m4/mbrtowc.m4 (gl_MBRTOWC_C_LOCALE): New macro. + (gl_FUNC_MBRTOWC): Use it, and define C_LOCALE_MAYBE_EILSEQ as needed. + * modules/hard-locale (License): Now LGPLv2+, for mbrtowc. + * modules/mbrtowc (Depends-on): Add hard-locale. + * modules/mbrtowc-tests (Files, TESTS): Add tests/test-mbrtowc5.sh. + * tests/test-mbrtowc.c (main): Test for bug fix if arg is '5'. + * tests/test-mbrtowc5.sh: New file. + 2016-04-03 Pedro Alves <pal...@redhat.com> stdint: detect good enough pre-C++11 stdint.h in C++ mode diff --git a/doc/posix-functions/mbrlen.texi b/doc/posix-functions/mbrlen.texi index 7db550e..3f1d472 100644 --- a/doc/posix-functions/mbrlen.texi +++ b/doc/posix-functions/mbrlen.texi @@ -12,6 +12,10 @@ Portability problems fixed by Gnulib: This function is missing on some platforms: Minix 3.1.8, HP-UX 11.00, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5. @item +In the C or POSIX locales, this function can return @code{(size_t) -1} +and set @code{errno} to @code{EILSEQ}: +glibc 2.23. +@item This function returns 0 instead of @code{(size_t) -2} when the input is empty: glibc 2.19. 
diff --git a/doc/posix-functions/mbrtowc.texi b/doc/posix-functions/mbrtowc.texi index 7c7f5fd..ad5c671 100644 --- a/doc/posix-functions/mbrtowc.texi +++ b/doc/posix-functions/mbrtowc.texi @@ -12,6 +12,10 @@ Portability problems fixed by Gnulib: This function is missing on some platforms: Minix 3.1.8, HP-UX 11.00, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5. @item +In the C or POSIX locales, this function can return @code{(size_t) -1} +and set @code{errno} to @code{EILSEQ}: +glibc 2.23. +@item This function returns 0 instead of @code{(size_t) -2} when the input is empty: glibc 2.19. diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c index 864e006..cdd874b 100644 --- a/lib/mbrtowc.c +++ b/lib/mbrtowc.c @@ -20,6 +20,11 @@ /* Specification. */ #include <wchar.h> +#if C_LOCALE_MAYBE_EILSEQ +# include "hard-locale.h" +# include <locale.h> +#endif + #if GNULIB_defined_mbstate_t /* Implement mbrtowc() on top of mbtowc(). */ @@ -328,6 +333,9 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) size_t rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) { + size_t ret; + wchar_t wc; + # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG if (s == NULL) { @@ -342,6 +350,9 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) return (size_t) -2; # endif + if (! pwc) + pwc = &wc; + # if MBRTOWC_RETVAL_BUG { static mbstate_t internal_state; @@ -357,8 +368,7 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) size_t count = 0; for (; n > 0; s++, n--) { - wchar_t wc; - size_t ret = mbrtowc (&wc, s, 1, ps); + ret = mbrtowc (&wc, s, 1, ps); if (ret == (size_t)(-1)) return (size_t)(-1); @@ -366,8 +376,7 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) if (ret != (size_t)(-2)) { /* The multibyte character has been completed. */ - if (pwc != NULL) - *pwc = wc; + *pwc = wc; return (wc == 0 ? 
0 : count); } } @@ -376,32 +385,23 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) } # endif -# if MBRTOWC_NUL_RETVAL_BUG - { - wchar_t wc; - size_t ret = mbrtowc (&wc, s, n, ps); + ret = mbrtowc (pwc, s, n, ps); - if (ret != (size_t)(-1) && ret != (size_t)(-2)) - { - if (pwc != NULL) - *pwc = wc; - if (wc == 0) - ret = 0; - } - return ret; - } -# else - { -# if MBRTOWC_NULL_ARG1_BUG - wchar_t dummy; - - if (pwc == NULL) - pwc = &dummy; -# endif +# if MBRTOWC_NUL_RETVAL_BUG + if (ret < (size_t) -2 && !*pwc) + return 0; +# endif - return mbrtowc (pwc, s, n, ps); - } +# if C_LOCALE_MAYBE_EILSEQ + if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE)) + { + unsigned char uc = *s; + *pwc = uc; + return 1; + } # endif + + return ret; } #endif diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4 index e8c7eeb..d370fcc 100644 --- a/m4/mbrtowc.m4 +++ b/m4/mbrtowc.m4 @@ -1,4 +1,4 @@ -# mbrtowc.m4 serial 26 -*- coding: utf-8 -*- +# mbrtowc.m4 serial 27 -*- coding: utf-8 -*- dnl Copyright (C) 2001-2002, 2004-2005, 2008-2016 Free Software Foundation, dnl Inc. dnl This file is free software; the Free Software Foundation @@ -40,6 +40,7 @@ AC_DEFUN([gl_FUNC_MBRTOWC], gl_MBRTOWC_RETVAL gl_MBRTOWC_NUL_RETVAL gl_MBRTOWC_EMPTY_INPUT + gl_MBRTOWC_C_LOCALE case "$gl_cv_func_mbrtowc_null_arg1" in *yes) ;; *) AC_DEFINE([MBRTOWC_NULL_ARG1_BUG], [1], @@ -76,6 +77,13 @@ AC_DEFUN([gl_FUNC_MBRTOWC], REPLACE_MBRTOWC=1 ;; esac + case $gl_cv_C_locale_sans_EILSEQ in + *yes) ;; + *) AC_DEFINE([C_LOCALE_MAYBE_EILSEQ], [1], + [Define to 1 if the C locale may have encoding errors.]) + REPLACE_MBRTOWC=1 + ;; + esac fi fi ]) @@ -577,6 +585,46 @@ changequote([,])dnl ]) ]) +dnl Test whether mbrtowc reports encoding errors in the C locale. +dnl Although POSIX was never intended to allow this, the GNU C Library +dnl and other implementations do it. 
See: +dnl https://sourceware.org/bugzilla/show_bug.cgi?id=19932 + +AC_DEFUN([gl_MBRTOWC_C_LOCALE], +[ + AC_CACHE_CHECK([whether the C locale is free of encoding errors], + [gl_cv_C_locale_sans_EILSEQ], + [ + dnl Initial guess, used when cross-compiling or when no suitable locale + dnl is present. + gl_cv_C_locale_sans_EILSEQ="guessing no" + + AC_RUN_IFELSE( + [AC_LANG_PROGRAM( + [[#include <limits.h> + #include <locale.h> + #include <wchar.h> + ]], [[ + int i; + char *locale = setlocale (LC_ALL, "C"); + if (! locale) + return 1; + for (i = CHAR_MIN; i <= CHAR_MAX; i++) + { + char c = i; + wchar_t wc; + mbstate_t mbs = { 0, }; + size_t ss = mbrtowc (&wc, &c, 1, &mbs); + if (1 < ss) + return 1; + } + return 0; + ]])], + [gl_cv_C_locale_sans_EILSEQ=yes], + [gl_cv_C_locale_sans_EILSEQ=no], + [:])]) +]) + # Prerequisites of lib/mbrtowc.c. AC_DEFUN([gl_PREREQ_MBRTOWC], [ : diff --git a/modules/hard-locale b/modules/hard-locale index 88dff8e..76c6edd 100644 --- a/modules/hard-locale +++ b/modules/hard-locale @@ -20,7 +20,7 @@ Include: "hard-locale.h" License: -GPL +LGPLv2+ Maintainer: Paul Eggert diff --git a/modules/mbrtowc b/modules/mbrtowc index 4e90b67..bd951ae 100644 --- a/modules/mbrtowc +++ b/modules/mbrtowc @@ -13,6 +13,7 @@ m4/codeset.m4 Depends-on: wchar extensions +hard-locale [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1] mbsinit [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1] localcharset [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1] streq [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1] diff --git a/modules/mbrtowc-tests b/modules/mbrtowc-tests index bbd2213..fe948c3 100644 --- a/modules/mbrtowc-tests +++ b/modules/mbrtowc-tests @@ -3,6 +3,7 @@ tests/test-mbrtowc1.sh tests/test-mbrtowc2.sh tests/test-mbrtowc3.sh tests/test-mbrtowc4.sh +tests/test-mbrtowc5.sh tests/test-mbrtowc.c tests/test-mbrtowc-w32-1.sh tests/test-mbrtowc-w32-2.sh @@ -31,6 +32,7 @@ gt_LOCALE_ZH_CN Makefile.am: TESTS += \ test-mbrtowc1.sh test-mbrtowc2.sh 
test-mbrtowc3.sh test-mbrtowc4.sh \ + test-mbrtowc5.sh \ test-mbrtowc-w32-1.sh test-mbrtowc-w32-2.sh test-mbrtowc-w32-3.sh \ test-mbrtowc-w32-4.sh test-mbrtowc-w32-5.sh TESTS_ENVIRONMENT += \ @@ -39,4 +41,3 @@ TESTS_ENVIRONMENT += \ LOCALE_JA='@LOCALE_JA@' \ LOCALE_ZH_CN='@LOCALE_ZH_CN@' check_PROGRAMS += test-mbrtowc test-mbrtowc-w32 - diff --git a/tests/test-mbrtowc.c b/tests/test-mbrtowc.c index 831836e..f7fed6a 100644 --- a/tests/test-mbrtowc.c +++ b/tests/test-mbrtowc.c @@ -72,6 +72,10 @@ main (int argc, char *argv[]) for (c = 0; c < 0x100; c++) switch (c) { + default: + if (! (c && 1 < argc && argv[1][0] == '5')) + break; + /* Fall through. */ case '\t': case '\v': case '\f': case ' ': case '!': case '"': case '#': case '%': case '&': case '\'': case '(': case ')': case '*': @@ -93,7 +97,8 @@ main (int argc, char *argv[]) case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '{': case '|': case '}': case '~': - /* c is in the ISO C "basic character set". */ + /* c is in the ISO C "basic character set", or argv[1] starts + with '5' so we are testing all nonnull bytes. */ buf[0] = c; wc = (wchar_t) 0xBADFACE; ret = mbrtowc (&wc, buf, 1, &state); @@ -334,6 +339,10 @@ main (int argc, char *argv[]) ASSERT (mbsinit (&state)); } return 0; + + case '5': + /* C locale; tested above. */ + return 0; } return 1; diff --git a/tests/test-mbrtowc5.sh b/tests/test-mbrtowc5.sh new file mode 100755 index 0000000..c10b228 --- /dev/null +++ b/tests/test-mbrtowc5.sh @@ -0,0 +1,6 @@ +#!/bin/sh +# Test whether the POSIX locale has encoding errors. +LC_ALL=C \ +./test-mbrtowc${EXEEXT} 5 || exit +LC_ALL=POSIX \ +./test-mbrtowc${EXEEXT} 5 -- 2.5.5