New module 'unistr/u8-mb-prev-uc'. * lib/unistr.in.h (u8_mb_prev_uc): New declaration. (u8_mb_prev_uc_aux): New declaration. * lib/unistr/u8-mb-prev-uc.c: New file. * lib/unistr/u8-mb-prev-uc-aux.c: New file. * tests/unistr/test-u8-mb-prev-uc.c: New file. * modules/unistr/u8-mb-prev-uc: New file. * modules/unistr/u8-mb-prev-uc-tests: New file.
New module 'unistr/u16-mb-prev-uc'. * lib/unistr.in.h (u16_mb_prev_uc): New declaration. (u16_mb_prev_uc_aux): New declaration. * lib/unistr/u16-mb-prev-uc.c: New file. * lib/unistr/u16-mb-prev-uc-aux.c: New file. * tests/unistr/test-u16-mb-prev-uc.c: New file. * modules/unistr/u16-mb-prev-uc: New file. * modules/unistr/u16-mb-prev-uc-tests: New file. New module 'unistr/u32-mb-prev-uc'. * lib/unistr.in.h (u32_mb_prev_uc): New declaration. * lib/unistr/u32-mb-prev-uc.c: New file. * tests/unistr/test-u32-mb-prev-uc.c: New file. * modules/unistr/u32-mb-prev-uc: New file. * modules/unistr/u32-mb-prev-uc-tests: New file. --- v1->v2: Revised based on Bruno Haible's feedback. v2->v3: Rebase only. v3->v4: Changed the code to always be "safe". It looks to me like the "unsafe" version that I had written originally reflected a misunderstanding of how the gnulib option for that was supposed to work. ChangeLog | 27 ++++ lib/unistr.in.h | 71 ++++++++++ lib/unistr/u16-mb-prev-uc-aux.c | 52 +++++++ lib/unistr/u16-mb-prev-uc.c | 62 +++++++++ lib/unistr/u32-mb-prev-uc.c | 43 ++++++ lib/unistr/u8-mb-prev-uc-aux.c | 131 +++++++++++++++++ lib/unistr/u8-mb-prev-uc.c | 142 +++++++++++++++++++ modules/unistr/u16-mb-prev-uc | 28 ++++ modules/unistr/u16-mb-prev-uc-tests | 12 ++ modules/unistr/u32-mb-prev-uc | 27 ++++ modules/unistr/u32-mb-prev-uc-tests | 12 ++ modules/unistr/u8-mb-prev-uc | 28 ++++ modules/unistr/u8-mb-prev-uc-tests | 14 ++ tests/unistr/test-u16-mb-prev-uc.c | 89 ++++++++++++ tests/unistr/test-u32-mb-prev-uc.c | 89 ++++++++++++ tests/unistr/test-u8-mb-prev-uc.c | 270 ++++++++++++++++++++++++++++++++++++ 16 files changed, 1097 insertions(+) create mode 100644 lib/unistr/u16-mb-prev-uc-aux.c create mode 100644 lib/unistr/u16-mb-prev-uc.c create mode 100644 lib/unistr/u32-mb-prev-uc.c create mode 100644 lib/unistr/u8-mb-prev-uc-aux.c create mode 100644 lib/unistr/u8-mb-prev-uc.c create mode 100644 modules/unistr/u16-mb-prev-uc create mode 100644 modules/unistr/u16-mb-prev-uc-tests create mode 100644 
modules/unistr/u32-mb-prev-uc create mode 100644 modules/unistr/u32-mb-prev-uc-tests create mode 100644 modules/unistr/u8-mb-prev-uc create mode 100644 modules/unistr/u8-mb-prev-uc-tests create mode 100644 tests/unistr/test-u16-mb-prev-uc.c create mode 100644 tests/unistr/test-u32-mb-prev-uc.c create mode 100644 tests/unistr/test-u8-mb-prev-uc.c diff --git a/ChangeLog b/ChangeLog index 2da7d9b..8c7ba46 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,30 @@ +2011-01-01 Ben Pfaff <b...@cs.stanford.edu> + + New module 'unistr/u8-mb-prev-uc'. + * lib/unistr.in.h (u8_mb_prev_uc): New declaration. + (u8_mb_prev_uc_aux): New declaration. + * lib/unistr/u8-mb-prev-uc.c: New file. + * lib/unistr/u8-mb-prev-uc-aux.c: New file. + * tests/unistr/test-u8-mb-prev-uc.c: New file. + * modules/unistr/u8-mb-prev-uc: New file. + * modules/unistr/u8-mb-prev-uc-tests: New file. + + New module 'unistr/u16-mb-prev-uc'. + * lib/unistr.in.h (u16_mb_prev_uc): New declaration. + (u16_mb_prev_uc_aux): New declaration. + * lib/unistr/u16-mb-prev-uc.c: New file. + * lib/unistr/u16-mb-prev-uc-aux.c: New file. + * tests/unistr/test-u16-mb-prev-uc.c: New file. + * modules/unistr/u16-mb-prev-uc: New file. + * modules/unistr/u16-mb-prev-uc-tests: New file. + + New module 'unistr/u32-mb-prev-uc'. + * lib/unistr.in.h (u32_mb_prev_uc): New declaration. + * lib/unistr/u32-mb-prev-uc.c: New file. + * tests/unistr/test-u32-mb-prev-uc.c: New file. + * modules/unistr/u32-mb-prev-uc: New file. + * modules/unistr/u32-mb-prev-uc-tests: New file. + 2014-09-05 Mathieu Anquetin <math...@anquetin.eu> Trivial change. diff --git a/lib/unistr.in.h b/lib/unistr.in.h index 73d2c23..41078cc 100644 --- a/lib/unistr.in.h +++ b/lib/unistr.in.h @@ -300,6 +300,77 @@ extern int u32_mbtoucr (ucs4_t *puc, const uint32_t *s, size_t n); #endif +/* Return the length (number of units) of the last character in S, putting + its 'ucs4_t' representation in *PUC. Upon failure, *PUC is set to 0xfffd, + and an appropriate number of units is returned. + The number of available units, N, must be > 0. 
*/ + +#if GNULIB_UNISTR_U8_MB_PREV_UC || HAVE_LIBUNISTRING +# if !HAVE_INLINE +extern int + u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n); +# else +extern int + u8_mb_prev_uc_aux (ucs4_t *puc, const uint8_t *s, size_t n); +static inline int +u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n) +{ + uint8_t c = s[n - 1]; + + if (c < 0x80) + { + *puc = c; + return 1; + } + else + return u8_mb_prev_uc_aux (puc, s, n); +} +# endif +#endif + +#if GNULIB_UNISTR_U16_MB_PREV_UC || HAVE_LIBUNISTRING +# if !HAVE_INLINE +extern int + u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n); +# else +extern int + u16_mb_prev_uc_aux (ucs4_t *puc, const uint16_t *s, size_t n); +static inline int +u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n) +{ + uint16_t c = s[n - 1]; + + if (c < 0xd800 || c >= 0xe000) + { + *puc = c; + return 1; + } + else + return u16_mb_prev_uc_aux (puc, s, n); +} +# endif +#endif + +#if GNULIB_UNISTR_U32_MB_PREV_UC || HAVE_LIBUNISTRING +# if !HAVE_INLINE +extern int + u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n); +# else +static inline int +u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n _GL_UNUSED_PARAMETER) +{ + uint32_t c = s[n - 1]; + + if (c < 0xd800 || (c >= 0xe000 && c < 0x110000)) + *puc = c; + else + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} +# endif +#endif + /* Put the multibyte character represented by UC in S, returning its length. Return -1 upon failure, -2 if the number of available units, N, is too small. The latter case cannot occur if N >= 6/2/1, respectively. */ diff --git a/lib/unistr/u16-mb-prev-uc-aux.c b/lib/unistr/u16-mb-prev-uc-aux.c new file mode 100644 index 0000000..eeab787 --- /dev/null +++ b/lib/unistr/u16-mb-prev-uc-aux.c @@ -0,0 +1,52 @@ +/* Look at last character in UTF-16 string. + Copyright (C) 1999-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc. 
+ Written by Ben Pfaff <b...@cs.stanford.edu>, 2011, + based on code by Bruno Haible <br...@clisp.org>, 2001. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#if defined IN_LIBUNISTRING || HAVE_INLINE + +int +u16_mb_prev_uc_aux (ucs4_t *puc, const uint16_t *s, size_t n) +{ + uint16_t c = s[n - 1]; + + if (c >= 0xdc00) + { + if (n >= 2) + { + if (s[n - 2] >= 0xd800 && s[n - 2] < 0xdc00) + { + *puc = 0x10000 + ((s[n - 2] - 0xd800) << 10) + (c - 0xdc00); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + } + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u16-mb-prev-uc.c b/lib/unistr/u16-mb-prev-uc.c new file mode 100644 index 0000000..3511666 --- /dev/null +++ b/lib/unistr/u16-mb-prev-uc.c @@ -0,0 +1,62 @@ +/* Look at last character in UTF-16 string. + Copyright (C) 1999-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc. + Written by Ben Pfaff <b...@cs.stanford.edu>, 2011, + based on code by Bruno Haible <br...@clisp.org>, 2001. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u16_mb_prev_uc as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n) +{ + uint16_t c = s[n - 1]; + + if (c < 0xd800 || c >= 0xe000) + { + *puc = c; + return 1; + } + if (c >= 0xdc00) + { + if (n >= 2) + { + if (s[n - 2] >= 0xd800 && s[n - 2] < 0xdc00) + { + *puc = 0x10000 + ((s[n - 2] - 0xd800) << 10) + (c - 0xdc00); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + } + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u32-mb-prev-uc.c b/lib/unistr/u32-mb-prev-uc.c new file mode 100644 index 0000000..398827b --- /dev/null +++ b/lib/unistr/u32-mb-prev-uc.c @@ -0,0 +1,43 @@ +/* Look at last character in UTF-32 string. + Copyright (C) 2002, 2006-2007, 2009-2011 Free Software Foundation, Inc. + Written by Bruno Haible <br...@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u32_mb_prev_uc as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n) +{ + uint32_t c = s[n - 1]; + + if (c < 0xd800 || (c >= 0xe000 && c < 0x110000)) + *puc = c; + else + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u8-mb-prev-uc-aux.c b/lib/unistr/u8-mb-prev-uc-aux.c new file mode 100644 index 0000000..1af912d --- /dev/null +++ b/lib/unistr/u8-mb-prev-uc-aux.c @@ -0,0 +1,131 @@ +/* Look at last character in UTF-8 string. + Copyright (C) 2001-2002, 2006-2007, 2009-2011, 2014 Free Software Foundation, Inc. + Written by Ben Pfaff <b...@cs.stanford.edu>, 2010, + based on code by Bruno Haible <br...@clisp.org>, 2001. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#if defined IN_LIBUNISTRING || HAVE_INLINE + +int +u8_mb_prev_uc_aux (ucs4_t *puc, const uint8_t *s, size_t n) +{ + uint8_t c_1 = s[n - 1]; + + /* The #if 1'd blocks below are code that could be deleted if one decided to + build an unsafe variant of this function. */ + +#if 1 + if (c_1 <= 0xbf) +#endif + { + if (n >= 2) + { + uint8_t c_2 = s[n - 2]; + + if ((c_2 ^ 0x80) >= 0x40) + { +#if 1 + if (c_2 >= 0xc2 && c_2 < 0xe0) +#endif + { + *puc = ((unsigned int) (c_2 & 0x1f) << 6) + | (unsigned int) (c_1 ^ 0x80); + return 2; + } +#if 1 + if (c_2 >= 0xe0 && c_2 < 0xf8) + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return 2; + } +#endif + } + else if (n >= 3) + { + uint8_t c_3 = s[n - 3]; + + if ((c_3 ^ 0x80) >= 0x40) + { +#if 1 + if ((c_3 == 0xe0 && c_2 >= 0xa0) + || (c_3 >= 0xe1 && c_3 < 0xed) + || (c_3 == 0xed && c_2 < 0xa0) + || (c_3 >= 0xee && c_3 < 0xf0)) +#endif + { + *puc = ((unsigned int) (c_3 & 0x0f) << 12) + | (unsigned int) ((c_2 ^ 0x80) << 6) + | (unsigned int) (c_1 ^ 0x80); + return 3; + } +#if 1 + if (c_3 >= 0xe0 && c_3 < 0xf8) + { + /* 0xe0: overlong sequence. + 0xe1...0xec: not reached. + 0xed: UTF-16 surrogate. + 0xee...0xef: not reached. + 0xf0...0xf7: incomplete multibyte character. */ + *puc = 0xfffd; + return 3; + } +#endif + } + else if (n >= 4) + { + uint8_t c_4 = s[n - 4]; + + if ((c_4 ^ 0x80) >= 0x40) + { +#if 1 + if ((c_4 == 0xf0 && c_3 >= 0x90) + || (c_4 >= 0xf1 && c_4 < 0xf4) + || (c_4 == 0xf4 && c_3 < 0x90)) +#endif + { + *puc = (unsigned int) ((c_4 & 0x07) << 18) + | (unsigned int) ((c_3 ^ 0x80) << 12) + | (unsigned int) ((c_2 ^ 0x80) << 6) + | (unsigned int) (c_1 ^ 0x80); + return 4; + } +#if 1 + if (c_4 >= 0xf0 && c_4 < 0xf8) + { + /* 0xf0: overlong sequence. + 0xf1...0xf3: not reached. 
+ 0xf4...0xf7: invalid code point above U+10FFFF */ + *puc = 0xfffd; + return 4; + } +#endif + } + } + } + } + } + + /* invalid or incomplete multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u8-mb-prev-uc.c b/lib/unistr/u8-mb-prev-uc.c new file mode 100644 index 0000000..86cbd73 --- /dev/null +++ b/lib/unistr/u8-mb-prev-uc.c @@ -0,0 +1,142 @@ +/* Look at last character in UTF-8 string. + Copyright (C) 2001-2002, 2006-2007, 2009-2011, 2014 Free Software Foundation, Inc. + Written by Ben Pfaff <b...@cs.stanford.edu>, 2010, + based on code by Bruno Haible <br...@clisp.org>, 2001. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u8_mb_prev_uc as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n) +{ + uint8_t c_1 = s[n - 1]; + + if (c_1 < 0x80) + { + *puc = c_1; + return 1; + } + + /* The #if 1'd blocks below are code that could be deleted if one decided to + build an unsafe variant of this function. 
*/ + +#if 1 + if (c_1 <= 0xbf) +#endif + { + if (n >= 2) + { + uint8_t c_2 = s[n - 2]; + + if ((c_2 ^ 0x80) >= 0x40) + { +#if 1 + if (c_2 >= 0xc2 && c_2 < 0xe0) +#endif + { + *puc = ((unsigned int) (c_2 & 0x1f) << 6) + | (unsigned int) (c_1 ^ 0x80); + return 2; + } +#if 1 + if (c_2 >= 0xe0 && c_2 < 0xf8) + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return 2; + } +#endif + } + else if (n >= 3) + { + uint8_t c_3 = s[n - 3]; + + if ((c_3 ^ 0x80) >= 0x40) + { +#if 1 + if ((c_3 == 0xe0 && c_2 >= 0xa0) + || (c_3 >= 0xe1 && c_3 < 0xed) + || (c_3 == 0xed && c_2 < 0xa0) + || (c_3 >= 0xee && c_3 < 0xf0)) +#endif + { + *puc = ((unsigned int) (c_3 & 0x0f) << 12) + | (unsigned int) ((c_2 ^ 0x80) << 6) + | (unsigned int) (c_1 ^ 0x80); + return 3; + } +#if 1 + if (c_3 >= 0xe0 && c_3 < 0xf8) + { + /* 0xe0: overlong sequence. + 0xe1...0xec: not reached. + 0xed: UTF-16 surrogate. + 0xee...0xef: not reached. + 0xf0...0xf7: incomplete multibyte character. */ + *puc = 0xfffd; + return 3; + } +#endif + } + else if (n >= 4) + { + uint8_t c_4 = s[n - 4]; + + if ((c_4 ^ 0x80) >= 0x40) + { +#if 1 + if ((c_4 == 0xf0 && c_3 >= 0x90) + || (c_4 >= 0xf1 && c_4 < 0xf4) + || (c_4 == 0xf4 && c_3 < 0x90)) +#endif + { + *puc = (unsigned int) ((c_4 & 0x07) << 18) + | (unsigned int) ((c_3 ^ 0x80) << 12) + | (unsigned int) ((c_2 ^ 0x80) << 6) + | (unsigned int) (c_1 ^ 0x80); + return 4; + } +#if 1 + if (c_4 >= 0xf0 && c_4 < 0xf8) + { + /* 0xf0: overlong sequence. + 0xf1...0xf3: not reached. + 0xf4...0xf7: invalid code point above U+10FFFF */ + *puc = 0xfffd; + return 4; + } +#endif + } + } + } + } + } + + /* invalid or incomplete multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/modules/unistr/u16-mb-prev-uc b/modules/unistr/u16-mb-prev-uc new file mode 100644 index 0000000..508fc72 --- /dev/null +++ b/modules/unistr/u16-mb-prev-uc @@ -0,0 +1,28 @@ +Description: +Look at last character in UTF-16 string. 
+ +Files: +lib/unistr/u16-mb-prev-uc.c +lib/unistr/u16-mb-prev-uc-aux.c + +Depends-on: +unistr/base + +configure.ac: +gl_MODULE_INDICATOR([unistr/u16-mb-prev-uc]) +gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u16-mb-prev-uc]) + +Makefile.am: +if LIBUNISTRING_COMPILE_UNISTR_U16_MB_PREV_UC +lib_SOURCES += unistr/u16-mb-prev-uc.c unistr/u16-mb-prev-uc-aux.c +endif + +Include: +"unistr.h" + +License: +LGPL + +Maintainer: +Bruno Haible, Ben Pfaff + diff --git a/modules/unistr/u16-mb-prev-uc-tests b/modules/unistr/u16-mb-prev-uc-tests new file mode 100644 index 0000000..a9f504f --- /dev/null +++ b/modules/unistr/u16-mb-prev-uc-tests @@ -0,0 +1,12 @@ +Files: +tests/unistr/test-u16-mb-prev-uc.c + +Depends-on: + +configure.ac: + +Makefile.am: +TESTS += test-u16-mb-prev-uc +check_PROGRAMS += test-u16-mb-prev-uc +test_u16_mb_prev_uc_SOURCES = unistr/test-u16-mb-prev-uc.c +test_u16_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING) diff --git a/modules/unistr/u32-mb-prev-uc b/modules/unistr/u32-mb-prev-uc new file mode 100644 index 0000000..ad7974a --- /dev/null +++ b/modules/unistr/u32-mb-prev-uc @@ -0,0 +1,27 @@ +Description: +Look at last character in UTF-32 string. 
+ +Files: +lib/unistr/u32-mb-prev-uc.c + +Depends-on: +unistr/base + +configure.ac: +gl_MODULE_INDICATOR([unistr/u32-mb-prev-uc]) +gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u32-mb-prev-uc]) + +Makefile.am: +if LIBUNISTRING_COMPILE_UNISTR_U32_MB_PREV_UC +lib_SOURCES += unistr/u32-mb-prev-uc.c +endif + +Include: +"unistr.h" + +License: +LGPL + +Maintainer: +Bruno Haible, Ben Pfaff + diff --git a/modules/unistr/u32-mb-prev-uc-tests b/modules/unistr/u32-mb-prev-uc-tests new file mode 100644 index 0000000..e1e45c8 --- /dev/null +++ b/modules/unistr/u32-mb-prev-uc-tests @@ -0,0 +1,12 @@ +Files: +tests/unistr/test-u32-mb-prev-uc.c + +Depends-on: + +configure.ac: + +Makefile.am: +TESTS += test-u32-mb-prev-uc +check_PROGRAMS += test-u32-mb-prev-uc +test_u32_mb_prev_uc_SOURCES = unistr/test-u32-mb-prev-uc.c +test_u32_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING) diff --git a/modules/unistr/u8-mb-prev-uc b/modules/unistr/u8-mb-prev-uc new file mode 100644 index 0000000..2a12805 --- /dev/null +++ b/modules/unistr/u8-mb-prev-uc @@ -0,0 +1,28 @@ +Description: +Look at last character in UTF-8 string. 
+ +Files: +lib/unistr/u8-mb-prev-uc.c +lib/unistr/u8-mb-prev-uc-aux.c + +Depends-on: +unistr/base + +configure.ac: +gl_MODULE_INDICATOR([unistr/u8-mb-prev-uc]) +gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mb-prev-uc]) + +Makefile.am: +if LIBUNISTRING_COMPILE_UNISTR_U8_MB_PREV_UC +lib_SOURCES += unistr/u8-mb-prev-uc.c unistr/u8-mb-prev-uc-aux.c +endif + +Include: +"unistr.h" + +License: +LGPL + +Maintainer: +Bruno Haible, Ben Pfaff + diff --git a/modules/unistr/u8-mb-prev-uc-tests b/modules/unistr/u8-mb-prev-uc-tests new file mode 100644 index 0000000..66a593a --- /dev/null +++ b/modules/unistr/u8-mb-prev-uc-tests @@ -0,0 +1,14 @@ +Files: +tests/unistr/test-u8-mb-prev-uc.c +tests/macros.h + +Depends-on: +unistr/u8-mbtouc + +configure.ac: + +Makefile.am: +TESTS += test-u8-mb-prev-uc +check_PROGRAMS += test-u8-mb-prev-uc +test_u8_mb_prev_uc_SOURCES = unistr/test-u8-mb-prev-uc.c +test_u8_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING) diff --git a/tests/unistr/test-u16-mb-prev-uc.c b/tests/unistr/test-u16-mb-prev-uc.c new file mode 100644 index 0000000..7f85e98 --- /dev/null +++ b/tests/unistr/test-u16-mb-prev-uc.c @@ -0,0 +1,89 @@ +/* Test of u16_mb_prev_uc() function. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Ben Pfaff, 2011. 
*/ + +#include <config.h> + +#include "unistr.h" + +#include <assert.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +static void +test_u16_mb_prev_uc (int expect_len, ucs4_t expect_uc, ...) +{ + uint16_t s[16]; + va_list args; + size_t n; + + ucs4_t uc; + int len; + + va_start (args, expect_uc); + n = 0; + for (;;) + { + int unit = va_arg (args, int); + if (unit == -1) + break; + else if (n >= sizeof s / sizeof *s) + abort (); + + s[n++] = unit; + } + va_end (args); + + len = u16_mb_prev_uc (&uc, s, n); + if (len != expect_len || uc != expect_uc) + { + size_t i; + + fprintf (stderr, "u16_mb_prev_uc returned length %d and U+%04x, " + "expected length %d and U+%04x:", + len, (unsigned int) uc, + expect_len, (unsigned int) expect_uc); + for (i = 0; i < n; i++) + fprintf (stderr, " %04x", s[i]); + putc ('\n', stderr); + fflush (stderr); + abort (); + } +} + +int +main (void) +{ + /* Valid single-unit sequences. */ + test_u16_mb_prev_uc (1, 'a', 'a', -1); + test_u16_mb_prev_uc (1, 0x3042, 0x3042, -1); + test_u16_mb_prev_uc (1, 'b', 'a', 'b', -1); + test_u16_mb_prev_uc (1, 'x', 0x3042, 'x', -1); + + /* Valid surrogate pairs. */ + test_u16_mb_prev_uc (2, 0x1f610, 0xd83d, 0xde10, -1); + test_u16_mb_prev_uc (2, 0x1f610, 'x', 0xd83d, 0xde10, -1); + + /* Invalid surrogate pairs. */ + test_u16_mb_prev_uc (1, 0xfffd, 0xd800, -1); + test_u16_mb_prev_uc (1, 0xfffd, 'a', 0xd800, -1); + test_u16_mb_prev_uc (1, 0xfffd, 0xdeff, -1); + test_u16_mb_prev_uc (1, 0xfffd, 'b', 0xdeff, -1); + + return 0; +} diff --git a/tests/unistr/test-u32-mb-prev-uc.c b/tests/unistr/test-u32-mb-prev-uc.c new file mode 100644 index 0000000..6666877 --- /dev/null +++ b/tests/unistr/test-u32-mb-prev-uc.c @@ -0,0 +1,89 @@ +/* Test of u32_mb_prev_uc() function. + Copyright (C) 2011 Free Software Foundation, Inc. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Ben Pfaff, 2011. */ + +#include <config.h> + +#include "unistr.h" + +#include <assert.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +static void +test_u32_mb_prev_uc (int expect_len, ucs4_t expect_uc, ...) +{ + uint32_t s[16]; + va_list args; + size_t n; + + ucs4_t uc; + int len; + + va_start (args, expect_uc); + n = 0; + for (;;) + { + int unit = va_arg (args, int); + if (unit == -1) + break; + else if (n >= sizeof s / sizeof *s) + abort (); + + s[n++] = unit; + } + va_end (args); + + len = u32_mb_prev_uc (&uc, s, n); + if (len != expect_len || uc != expect_uc) + { + size_t i; + + fprintf (stderr, "u32_mb_prev_uc returned length %d and U+%04x, " + "expected length %d and U+%04x:", + len, (unsigned int) uc, + expect_len, (unsigned int) expect_uc); + for (i = 0; i < n; i++) + fprintf (stderr, " %04x", s[i]); + putc ('\n', stderr); + fflush (stderr); + abort (); + } +} + +int +main (void) +{ + /* Valid. */ + test_u32_mb_prev_uc (1, 'a', 'a', -1); + test_u32_mb_prev_uc (1, 0x3042, 0x3042, -1); + test_u32_mb_prev_uc (1, 'b', 'a', 'b', -1); + test_u32_mb_prev_uc (1, 'x', 0x3042, 'x', -1); + + /* Surrogate pairs are invalid in UTF-32. 
*/ + test_u32_mb_prev_uc (1, 0xfffd, 0xd83d, 0xde10, -1); + test_u32_mb_prev_uc (1, 0xfffd, 'x', 0xd83d, 0xde10, -1); + + /* Malformed surrogate pairs are doubly invalid in UTF-32. */ + test_u32_mb_prev_uc (1, 0xfffd, 0xd800, -1); + test_u32_mb_prev_uc (1, 0xfffd, 'a', 0xd800, -1); + test_u32_mb_prev_uc (1, 0xfffd, 0xdeff, -1); + test_u32_mb_prev_uc (1, 0xfffd, 'b', 0xdeff, -1); + + return 0; +} diff --git a/tests/unistr/test-u8-mb-prev-uc.c b/tests/unistr/test-u8-mb-prev-uc.c new file mode 100644 index 0000000..59d9a3c --- /dev/null +++ b/tests/unistr/test-u8-mb-prev-uc.c @@ -0,0 +1,270 @@ +/* Test of u8_mb_prev_uc() function. + Copyright (C) 2010, 2011, 2014 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Written by Ben Pfaff, 2010. */ + +#include <config.h> + +#include "unistr.h" + +#include <assert.h> + +#include "macros.h" + +struct uc + { + /* UTF-8 representation. */ + const uint8_t *s; + int n; + + /* Code point. */ + ucs4_t uc; + }; + +/* Print the N code points and their representations in UC on stderr, preceded + by TITLE. 
*/ +static void +print_ucs (const char *title, const struct uc *uc, size_t n) +{ + fprintf (stderr, "%s:", title); + for (; n-- > 0; uc++) + { + size_t i; + + fprintf (stderr, " <"); + for (i = 0; i < uc->n; i++) + { + if (i > 0) + putc (' ', stderr); + fprintf (stderr, "%02x", (unsigned int) uc->s[i]); + } + fprintf (stderr, "> U+%04X", (unsigned int) uc->uc); + } + putc ('\n', stderr); +} + +/* Reverses the order of the N elements of UC. */ +static void +reverse_ucs (struct uc *uc, size_t n) +{ + size_t i; + + for (i = 0; i < n / 2; i++) + { + size_t j = n - (i + 1); + struct uc tmp = uc[i]; + uc[i] = uc[j]; + uc[j] = tmp; + } +} + +static bool +equal_ucs (const struct uc *a, size_t n_a, const struct uc *b, size_t n_b) +{ + if (n_a != n_b) + return false; + for (; n_a-- > 0; a++, b++) + if (a->n != b->n || a->s != b->s || a->uc != b->uc) + return false; + return true; +} + +/* Checks that the N units in S yield the same code points whether iterated + in the forward or reverse direction. */ +static void +check_bidirectionally (const uint8_t *s, int n) +{ + struct uc ucf[16]; + struct uc ucr[16]; + int n_ucf, n_ucr; + int used; + + assert (n <= SIZEOF (ucf)); + assert (n <= SIZEOF (ucr)); + + /* Translate units to code points forward. */ + used = 0; + n_ucf = 0; + while (used < n) + { + struct uc *uc = &ucf[n_ucf++]; + uc->s = &s[used]; + uc->n = u8_mbtouc (&uc->uc, uc->s, n - used); + ASSERT (uc->n >= 1); + ASSERT (uc->n <= n - used); + used += uc->n; + } + + /* Translate units to code points backward. */ + used = 0; + n_ucr = 0; + while (used < n) + { + struct uc *uc = &ucr[n_ucr++]; + uc->n = u8_mb_prev_uc (&uc->uc, s, n - used); + ASSERT (uc->n >= 1); + ASSERT (uc->n <= n - used); + used += uc->n; + uc->s = &s[n - used]; + } + reverse_ucs (ucr, n_ucr); + + /* Check that the results were the same. 
*/ + if (!equal_ucs (ucf, n_ucf, ucr, n_ucr)) + { + fprintf (stderr, "%s:%d: forward and reverse differ\n", + __FILE__, __LINE__); + print_ucs ("forward", ucf, n_ucf); + print_ucs ("reverse", ucr, n_ucr); + fflush (stderr); + abort (); + } +} + +static void +do_exhaustive_test (const uint8_t *start, uint8_t *s, int n) +{ + /* The units to test. */ + static const uint8_t units[] = { + /* The smallest value in each class. (Any other member or members would + work as well). */ + 0x00, 0x80, 0x90, 0xa0, 0xc0, 0xc2, 0xe0, 0xe1, 0xed, 0xee, 0xf0, 0xf1, + 0xf4, 0xf5, + + /* The UTF-8 units that make up U+FFFD, since that is such a special value + for these routines. */ + 0xef, 0xbf, 0xbd + }; + int i; + + for (i = 0; i < SIZEOF (units); i++) + { + s[0] = units[i]; + if (n > 1) + do_exhaustive_test (start, s + 1, n - 1); + else + check_bidirectionally (start, (s + 1) - start); + } +} + +/* This test exhaustively compares how u8_mbtouc() and u8_mb_prev_uc() treat + all UTF-8 well-formed and ill-formed sequences that are MAX_LENGTH units or + shorter. To do so in a reasonable amount of time, it uses a trick: many + UTF-8 unit values are in classes whose members are all treated the same way. + Thus, it is only necessary to test one member of each class. */ +static void +exhaustive_test (int max_length) +{ + uint8_t s[16]; + int length; + + assert (max_length <= SIZEOF (s)); + for (length = 0; length <= max_length; length++) + do_exhaustive_test (s, s, length); +} + +static void +do_well_formed_test (const uint8_t *start, uint8_t *s, int n) +{ + if (n == 0) + { + check_bidirectionally (start, s - start); + return; + } + + /* Test single-byte characters. */ + s[0] = 0; + do_well_formed_test (start, s + 1, n - 1); + + s[0] = 0x41; + do_well_formed_test (start, s + 1, n - 1); + + /* Test 2-byte characters. */ + if (n >= 2) + { + s[0] = 0xc2; + s[1] = 0xb0; + do_well_formed_test (start, s + 2, n - 2); + } + + /* Test 3-byte characters. 
*/ + if (n >= 3) + { + s[0] = 0xe0; + s[1] = 0xa0; + s[2] = 0xa5; + do_well_formed_test (start, s + 3, n - 3); + + s[0] = 0xe5; + s[1] = 0xbf; + s[2] = 0x81; + do_well_formed_test (start, s + 3, n - 3); + + s[0] = 0xed; + s[1] = 0x9f; + s[2] = 0x99; + do_well_formed_test (start, s + 3, n - 3); + } + + /* Test 4-byte characters. */ + if (n >= 4) + { + s[0] = 0xf0; + s[1] = 0x90; + s[2] = 0xbb; + s[3] = 0x80; + do_well_formed_test (start, s + 4, n - 4); + + s[0] = 0xf2; + s[1] = 0x80; + s[2] = 0xbf; + s[3] = 0x80; + do_well_formed_test (start, s + 4, n - 4); + + s[0] = 0xf4; + s[1] = 0x8f; + s[2] = 0x80; + s[3] = 0xbf; + do_well_formed_test (start, s + 4, n - 4); + } +} + +/* Checks iteration through all possible sets of UTF-8 sequence lengths with + no more than MAX_LENGTH units. */ +static void +well_formed_test (int max_length) +{ + uint8_t s[16]; + int length; + + assert (max_length <= SIZEOF (s)); + for (length = 0; length <= max_length; length++) + do_well_formed_test (s, s, length); +} + +int +main (void) +{ + /* Runtime increases exponentially with the argument: 4 runs in a fraction + of a second, 5 in a few seconds, 6 in half a minute. */ + exhaustive_test (5); + + /* Runtime increases exponentially but much more slowly than with + exhaustive_test(). */ + well_formed_test (10); + + return 0; +} -- 1.9.1