Dear Bruno, thank you for responding so quickly and for this addition!
Marc Am Sa., 1. Jan. 2022 um 19:55 Uhr schrieb Bruno Haible <br...@clisp.org>: > > Marc Nieper-Wißkirchen wrote on 2021-12-30: > > The striconveh module and related modules offer an error handler > > argument. The current possible values are: > > > > iconveh_error > > iconveh_question_mark > > iconveh_escape_sequence > > > > The second option replaces any unconvertible character with a question mark > > "?". > > > > I would like to request to add a fourth option, say, > > iconveh_replacement_character, which is like iconveh_question_mark but > > uses U+FFFD whenever the target codeset is a Unicode codeset. > > That's a good suggestion, as nowadays people are frequently converting > to UTF-8 or GB18030. Implemented as follows. > > > 2022-01-01 Bruno Haible <br...@clisp.org> > > striconveh: Support an error handler that produces a Unicode U+FFFD. > Suggested by Marc Nieper-Wißkirchen in > <https://lists.gnu.org/archive/html/bug-gnulib/2021-12/msg00175.html>. > * lib/iconveh.h (iconveh_replacement_character): New enum value. > * lib/striconveh.c (mem_cd_iconveh_internal): When the handler is > iconveh_replacement_character, try to produce U+FFFD when possible, > instead of '?'. > * tests/test-striconveh.c (main): Add GB18030 tests. Test also > iconveh_replacement_character. > > diff --git a/lib/iconveh.h b/lib/iconveh.h > index d321d34cb..058f68ca2 100644 > --- a/lib/iconveh.h > +++ b/lib/iconveh.h > @@ -29,7 +29,10 @@ enum iconv_ilseq_handler > { > iconveh_error, /* return and set errno = EILSEQ */ > iconveh_question_mark, /* use one '?' per unconvertible character */ > - iconveh_escape_sequence /* use escape sequence \uxxxx or \Uxxxxxxxx > */ > + iconveh_escape_sequence, /* use escape sequence \uxxxx or \Uxxxxxxxx > */ > + iconveh_replacement_character /* use one U+FFFD per unconvertible character > + if that fits in the target encoding, > + otherwise one '?' */ > }; > > > diff --git a/lib/striconveh.c b/lib/striconveh.c > index 4aa8a2f07..612c38c3e 100644 > --- a/lib/striconveh.c > +++ b/lib/striconveh.c > @@ -457,13 +457,18 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > if (cd2 == (iconv_t)(-1)) > { > /* TO_CODESET is UTF-8. */ > - /* Error handling can produce up to 1 byte of output. */ > - if (length + 1 + extra_alloc > allocated) > + /* Error handling can produce up to 1 or 3 bytes of > + output. */ > + size_t extra_need = > + (handler == iconveh_replacement_character ? 3 : 1); > + if (length + extra_need + extra_alloc > allocated) > { > char *memory; > > allocated = 2 * allocated; > - if (length + 1 + extra_alloc > allocated) > + if (length + extra_need + extra_alloc > allocated) > + allocated = 2 * allocated; > + if (length + extra_need + extra_alloc > allocated) > abort (); > if (result == initial_result) > memory = (char *) malloc (allocated); > @@ -482,7 +487,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > grow = false; > } > /* The input is invalid in FROM_CODESET. Eat up one byte > - and emit a question mark. */ > + and emit a replacement character or a question mark. > */ > if (!incremented) > { > if (insize == 0) > @@ -490,8 +495,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > inptr++; > insize--; > } > - result[length] = '?'; > - length++; > + if (handler == iconveh_replacement_character) > + { > + /* U+FFFD in UTF-8 encoding. */ > + result[length+0] = '\357'; > + result[length+1] = '\277'; > + result[length+2] = '\275'; > + length += 3; > + } > + else > + { > + result[length] = '?'; > + length++; > + } > } > else > goto indirectly; > @@ -594,7 +610,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > { > const bool slowly = (offsets != NULL || handler == iconveh_error); > # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */ > - char utf8buf[utf8bufsize + 1]; > + char utf8buf[utf8bufsize + 3]; > size_t utf8len = 0; > const char *in1ptr = src; > size_t in1size = srclen; > @@ -682,8 +698,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > && errno == EILSEQ && handler != iconveh_error) > { > /* The input is invalid in FROM_CODESET. Eat up one byte and > - emit a question mark. Room for the question mark was > allocated > - at the end of utf8buf. */ > + emit a U+FFFD character or a question mark. Room for this > + character was allocated at the end of utf8buf. */ > if (!incremented1) > { > if (in1size == 0) > @@ -691,7 +707,16 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > in1ptr++; > in1size--; > } > - *out1ptr++ = '?'; > + if (handler == iconveh_replacement_character) > + { > + /* U+FFFD in UTF-8 encoding. */ > + out1ptr[0] = '\357'; > + out1ptr[1] = '\277'; > + out1ptr[2] = '\275'; > + out1ptr += 3; > + } > + else > + *out1ptr++ = '?'; > res1 = 0; > } > errno1 = errno; > @@ -756,7 +781,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > break; > else if (errno == EILSEQ && handler != iconveh_error) > { > - /* Error handling can produce up to 10 bytes of ASCII > + /* Error handling can produce up to 10 bytes of UTF-8 > output. But TO_CODESET may be UCS-2, UTF-16 or > UCS-4, so use CD2 here as well. */ > char scratchbuf[10]; > @@ -804,6 +829,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > scratchbuf[scratchlen++] = hex[(uc>>4) & 15]; > scratchbuf[scratchlen++] = hex[uc & 15]; > } > + else if (handler == iconveh_replacement_character) > + { > + /* U+FFFD in UTF-8 encoding. */ > + scratchbuf[0] = '\357'; > + scratchbuf[1] = '\277'; > + scratchbuf[2] = '\275'; > + scratchlen = 3; > + } > else > { > scratchbuf[0] = '?'; > @@ -813,9 +846,24 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, > inptr = scratchbuf; > insize = scratchlen; > if (cd2 != (iconv_t)(-1)) > - res = iconv (cd2, > - (ICONV_CONST char **) &inptr, &insize, > - &out2ptr, &out2size); > + { > + res = iconv (cd2, > + (ICONV_CONST char **) &inptr, > &insize, > + &out2ptr, &out2size); > + if (handler == iconveh_replacement_character > + && res == (size_t)(-1) && errno == EILSEQ) > + { > + /* U+FFFD can't be converted to TO_CODESET. > + Use '?' instead. */ > + scratchbuf[0] = '?'; > + scratchlen = 1; > + inptr = scratchbuf; > + insize = scratchlen; > + res = iconv (cd2, > + (ICONV_CONST char **) &inptr, > &insize, > + &out2ptr, &out2size); > + } > + } > else > { > /* TO_CODESET is UTF-8. */ > diff --git a/tests/test-striconveh.c b/tests/test-striconveh.c > index 438b7b087..781aa5254 100644 > --- a/tests/test-striconveh.c > +++ b/tests/test-striconveh.c > @@ -46,14 +46,19 @@ main () > { > #if HAVE_ICONV > static enum iconv_ilseq_handler handlers[] = > - { iconveh_error, iconveh_question_mark, iconveh_escape_sequence }; > + { > + iconveh_error, > + iconveh_question_mark, > + iconveh_replacement_character, > + iconveh_escape_sequence > + }; > size_t indirect; > size_t h; > size_t o; > size_t i; > > /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1, > - ISO-8859-2, and UTF-8. */ > + ISO-8859-2, UTF-8, and with libiconv or glibc also GB18030. */ > iconv_t cd_ascii_to_88591 = iconv_open ("ISO-8859-1", "ASCII"); > iconv_t cd_88591_to_88592 = iconv_open ("ISO-8859-2", "ISO-8859-1"); > iconv_t cd_88592_to_88591 = iconv_open ("ISO-8859-1", "ISO-8859-2"); > @@ -63,6 +68,12 @@ main () > iconv_t cd_88592_to_utf8 = iconv_open ("UTF-8", "ISO-8859-2"); > iconv_t cd_utf8_to_88592 = iconv_open ("ISO-8859-2", "UTF-8"); > iconv_t cd_utf7_to_utf8 = iconv_open ("UTF-8", "UTF-7"); > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + iconv_t cd_ascii_to_gb18030 = iconv_open ("GB18030", "ASCII"); > + iconv_t cd_utf8_to_gb18030 = iconv_open ("GB18030", "UTF-8"); > + iconv_t cd_88591_to_gb18030 = iconv_open ("GB18030", "ISO-8859-1"); > + iconv_t cd_utf7_to_gb18030 = iconv_open ("GB18030", "UTF-7"); > +# endif > iconveh_t cdeh_ascii_to_88591; > iconveh_t cdeh_ascii_to_88591_indirectly; > iconveh_t cdeh_88592_to_88591; > @@ -71,12 +82,21 @@ main () > iconveh_t cdeh_88591_to_utf8; > iconveh_t cdeh_utf8_to_88591; > iconveh_t cdeh_utf7_to_utf8; > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + iconveh_t cdeh_ascii_to_gb18030; > + iconveh_t cdeh_88591_to_gb18030; > + iconveh_t cdeh_utf7_to_gb18030; > +# endif > > ASSERT (cd_ascii_to_utf8 != (iconv_t)(-1)); > ASSERT (cd_88591_to_utf8 != (iconv_t)(-1)); > ASSERT (cd_utf8_to_88591 != (iconv_t)(-1)); > ASSERT (cd_88592_to_utf8 != (iconv_t)(-1)); > ASSERT (cd_utf8_to_88592 != (iconv_t)(-1)); > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + ASSERT (cd_ascii_to_gb18030 != (iconv_t)(-1)); > + ASSERT (cd_utf8_to_gb18030 != (iconv_t)(-1)); > +# endif > > cdeh_ascii_to_88591.cd = cd_ascii_to_88591; > cdeh_ascii_to_88591.cd1 = cd_ascii_to_utf8; > @@ -110,6 +130,20 @@ main () > cdeh_utf7_to_utf8.cd1 = cd_utf7_to_utf8; > cdeh_utf7_to_utf8.cd2 = (iconv_t)(-1); > > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + cdeh_ascii_to_gb18030.cd = cd_ascii_to_gb18030; > + cdeh_ascii_to_gb18030.cd1 = cd_ascii_to_utf8; > + cdeh_ascii_to_gb18030.cd2 = cd_utf8_to_gb18030; > + > + cdeh_88591_to_gb18030.cd = cd_88591_to_gb18030; > + cdeh_88591_to_gb18030.cd1 = cd_88591_to_utf8; > + cdeh_88591_to_gb18030.cd2 = cd_utf8_to_gb18030; > + > + cdeh_utf7_to_gb18030.cd = cd_utf7_to_gb18030; > + cdeh_utf7_to_gb18030.cd1 = cd_utf7_to_utf8; > + cdeh_utf7_to_gb18030.cd2 = cd_utf8_to_gb18030; > +# endif > + > /* ------------------------ Test mem_cd_iconveh() ------------------------ > */ > > /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors. */ > @@ -175,6 +209,7 @@ main () > free (offsets); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > case iconveh_escape_sequence: > { > static const char expected[] = "Rafa? Maszkowski"; > @@ -224,6 +259,7 @@ main () > free (offsets); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > { > static const char expected[] = "Rafa? Maszkowski"; > ASSERT (retval == 0); > @@ -294,6 +330,41 @@ main () > } > } > > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */ > + for (h = 0; h < SIZEOF (handlers); h++) > + { > + enum iconv_ilseq_handler handler = handlers[h]; > + static const char input[] = "\304rger mit b\366sen B\374bchen ohne > Augenma\337"; > + static const char expected[] = "\2010\2072rger mit b\2010\2132sen > B\250\271bchen ohne Augenma\2010\2118"; > + for (o = 0; o < 2; o++) > + { > + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); > + char *result = NULL; > + size_t length = 0; > + int retval = mem_cd_iconveh (input, strlen (input), > + &cdeh_88591_to_gb18030, > + handler, > + offsets, > + &result, &length); > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected)); > + ASSERT (result != NULL && memcmp (result, expected, strlen > (expected)) == 0); > + if (o) > + { > + for (i = 0; i < 37; i++) > + ASSERT (offsets[i] == (i < 1 ? i : > + i < 12 ? i + 3 : > + i < 18 ? i + 6 : > + i + 7)); > + ASSERT (offsets[37] == MAGIC); > + free (offsets); > + } > + free (result); > + } > + } > +# endif > + > /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ > for (h = 0; h < SIZEOF (handlers); h++) > { > @@ -371,10 +442,88 @@ main () > free (result); > } > break; > + case iconveh_replacement_character: > + { > + static const char expected[] = "Rafa\357\277\275 Maszkowski"; > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected)); > + ASSERT (result != NULL && memcmp (result, expected, strlen > (expected)) == 0); > + if (o) > + { > + for (i = 0; i < 16; i++) > + ASSERT (offsets[i] == (i < 5 ? i : i + 2)); > + ASSERT (offsets[16] == MAGIC); > + free (offsets); > + } > + free (result); > + } > + break; > } > } > } > > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ). */ > + for (h = 0; h < SIZEOF (handlers); h++) > + { > + enum iconv_ilseq_handler handler = handlers[h]; > + static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski > */ > + for (o = 0; o < 2; o++) > + { > + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); > + char *result = NULL; > + size_t length = 0; > + int retval = mem_cd_iconveh (input, strlen (input), > + &cdeh_ascii_to_gb18030, > + handler, > + offsets, > + &result, &length); > + switch (handler) > + { > + case iconveh_error: > + ASSERT (retval == -1 && errno == EILSEQ); > + ASSERT (result == NULL); > + if (o) > + free (offsets); > + break; > + case iconveh_question_mark: > + case iconveh_escape_sequence: > + { > + static const char expected[] = "Rafa? Maszkowski"; > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected)); > + ASSERT (result != NULL && memcmp (result, expected, strlen > (expected)) == 0); > + if (o) > + { > + for (i = 0; i < 16; i++) > + ASSERT (offsets[i] == i); > + ASSERT (offsets[16] == MAGIC); > + free (offsets); > + } > + free (result); > + } > + break; > + case iconveh_replacement_character: > + { > + static const char expected[] = "Rafa\2041\2447 Maszkowski"; > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected)); > + ASSERT (result != NULL && memcmp (result, expected, strlen > (expected)) == 0); > + if (o) > + { > + for (i = 0; i < 16; i++) > + ASSERT (offsets[i] == (i < 5 ? i : i + 3)); > + ASSERT (offsets[16] == MAGIC); > + free (offsets); > + } > + free (result); > + } > + break; > + } > + } > + } > +# endif > + > /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */ > for (h = 0; h < SIZEOF (handlers); h++) > { > @@ -399,6 +548,7 @@ main () > free (offsets); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > { > static const char expected[] = "Rafa? Maszkowski"; > ASSERT (retval == 0); > @@ -496,6 +646,34 @@ main () > free (result); > } > > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + /* Test conversion from UTF-7 to GB18030 with EINVAL. */ > + for (h = 0; h < SIZEOF (handlers); h++) > + { > + enum iconv_ilseq_handler handler = handlers[h]; > + /* This is base64 encoded 0x54 0x32 0xD8 0x3F 0xD8 0x40. It would > + convert to U+5432 U+D83F U+D840 but these are Unicode > surrogates. */ > + static const char input[] = "+VDLYP9hA"; > + static const char expected1[] = "\337\305"; /* 吲 glibc */ > + static const char expected2[] = ""; /* libiconv */ > + char *result = NULL; > + size_t length = 0; > + int retval = mem_cd_iconveh (input, 7, > + &cdeh_utf7_to_gb18030, > + handler, > + NULL, > + &result, &length); > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected1) || length == strlen > (expected2)); > + ASSERT (result != NULL); > + if (length == strlen (expected1)) > + ASSERT (memcmp (result, expected1, strlen (expected1)) == 0); > + else > + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0); > + free (result); > + } > +# endif > + > /* Disabled on NetBSD, because NetBSD 5.0 iconv() is buggy: it converts > the input "+2D/YQNhB" to U+1FED8 U+3FD8 U+40D8. */ > # if !(defined __NetBSD__ && !defined _LIBICONV_VERSION) > @@ -544,8 +722,98 @@ main () > free (result); > } > break; > + case iconveh_replacement_character: > + { > + /* glibc result */ > + static const char expected1[] = > "\357\277\275\357\277\275\357\277\275\357\277\275\357\277\275"; > + /* libiconv <= 1.12 result */ > + static const char expected2[] = "\357\277\2752D/YQNhB"; > + /* libiconv >= 1.13 result */ > + static const char expected3[] = > "\357\277\275\340\277\266\341\200\266"; > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected1) > + || length == strlen (expected2) > + || length == strlen (expected3)); > + ASSERT (result != NULL); > + if (length == strlen (expected1)) > + ASSERT (memcmp (result, expected1, strlen (expected1)) == > 0); > + else if (length == strlen (expected2)) > + ASSERT (memcmp (result, expected2, strlen (expected2)) == > 0); > + else > + ASSERT (memcmp (result, expected3, strlen (expected3)) == > 0); > + free (result); > + } > + } > + } > + > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined > __UCLIBC__) > + /* Test conversion from UTF-7 to GB18030 with EILSEQ. */ > + for (h = 0; h < SIZEOF (handlers); h++) > + { > + enum iconv_ilseq_handler handler = handlers[h]; > + /* This is base64 encoded 0xD8 0x3F 0xD8 0x40 0xD8 0x41. It would > + convert to U+D83F U+D840 U+D841 but these are Unicode > surrogates. */ > + static const char input[] = "+2D/YQNhB"; > + char *result = NULL; > + size_t length = 0; > + int retval = mem_cd_iconveh (input, strlen (input), > + &cdeh_utf7_to_gb18030, > + handler, > + NULL, > + &result, &length); > + switch (handler) > + { > + case iconveh_error: > + ASSERT (retval == -1 && errno == EILSEQ); > + ASSERT (result == NULL); > + break; > + case iconveh_question_mark: > + case iconveh_escape_sequence: > + { > + /* glibc result */ > + static const char expected1[] = "?????"; > + /* libiconv <= 1.12 result */ > + static const char expected2[] = "?2D/YQNhB"; > + /* libiconv behaviour changed in version 1.13: the result is > + '?' U+0FF6 U+1036; this is U+D83F U+D840 U+D841 shifted > left > + by 6 bits. */ > + static const char expected3[] = "?\2013\2030\2013\2114"; > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected1) > + || length == strlen (expected2) > + || length == strlen (expected3)); > + ASSERT (result != NULL); > + if (length == strlen (expected1)) > + ASSERT (memcmp (result, expected1, strlen (expected1)) == > 0); > + else if (length == strlen (expected2)) > + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0 > + || memcmp (result, expected3, strlen (expected3)) > == 0); > + free (result); > + } > + break; > + case iconveh_replacement_character: > + { > + /* glibc result */ > + static const char expected1[] = > "\2041\2447\2041\2447\2041\2447\2041\2447\2041\2447"; > + /* libiconv <= 1.12 result */ > + static const char expected2[] = "\2041\24472D/YQNhB"; > + /* libiconv >= 1.13 result */ > + static const char expected3[] = > "\2041\2447\2013\2030\2013\2114"; > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected1) > + || length == strlen (expected2) > + || length == strlen (expected3)); > + ASSERT (result != NULL); > + if (length == strlen (expected1)) > + ASSERT (memcmp (result, expected1, strlen (expected1)) == > 0); > + else if (length == strlen (expected2)) > + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0 > + || memcmp (result, expected3, strlen (expected3)) > == 0); > + free (result); > + } > } > } > +# endif > # endif > # endif > } > @@ -589,6 +857,7 @@ main () > ASSERT (result == NULL && errno == EILSEQ); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > case iconveh_escape_sequence: > { > static const char expected[] = "Rafa? Maszkowski"; > @@ -619,6 +888,7 @@ main () > ASSERT (result == NULL && errno == EILSEQ); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > { > static const char expected[] = "Rafa? Maszkowski"; > ASSERT (result != NULL); > @@ -652,6 +922,22 @@ main () > free (result); > } > > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */ > + for (h = 0; h < SIZEOF (handlers); h++) > + { > + enum iconv_ilseq_handler handler = handlers[h]; > + static const char input[] = "\304rger mit b\366sen B\374bchen ohne > Augenma\337"; > + static const char expected[] = "\2010\2072rger mit b\2010\2132sen > B\250\271bchen ohne Augenma\2010\2118"; > + char *result = str_cd_iconveh (input, > + &cdeh_88591_to_gb18030, > + handler); > + ASSERT (result != NULL); > + ASSERT (strcmp (result, expected) == 0); > + free (result); > + } > +# endif > + > /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ > for (h = 0; h < SIZEOF (handlers); h++) > { > @@ -688,8 +974,51 @@ main () > free (result); > } > break; > + case iconveh_replacement_character: > + { > + static const char expected[] = "Rafa\357\277\275 Maszkowski"; > + ASSERT (result != NULL); > + ASSERT (strcmp (result, expected) == 0); > + free (result); > + } > + break; > + } > + } > + > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ). */ > + for (h = 0; h < SIZEOF (handlers); h++) > + { > + enum iconv_ilseq_handler handler = handlers[h]; > + static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski > */ > + char *result = str_cd_iconveh (input, > + &cdeh_ascii_to_gb18030, > + handler); > + switch (handler) > + { > + case iconveh_error: > + ASSERT (result == NULL && errno == EILSEQ); > + break; > + case iconveh_question_mark: > + case iconveh_escape_sequence: > + { > + static const char expected[] = "Rafa? Maszkowski"; > + ASSERT (result != NULL); > + ASSERT (strcmp (result, expected) == 0); > + free (result); > + } > + break; > + case iconveh_replacement_character: > + { > + static const char expected[] = "Rafa\2041\2447 Maszkowski"; > + ASSERT (result != NULL); > + ASSERT (strcmp (result, expected) == 0); > + free (result); > + } > + break; > } > } > +# endif > > /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */ > for (h = 0; h < SIZEOF (handlers); h++) > @@ -705,6 +1034,7 @@ main () > ASSERT (result == NULL && errno == EILSEQ); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > { > static const char expected[] = "Costs: 27 ?"; > ASSERT (result != NULL); > @@ -801,6 +1131,7 @@ main () > free (offsets); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > { > static const char expected[] = "Rafa? Maszkowski"; > ASSERT (retval == 0); > @@ -870,6 +1201,41 @@ main () > } > } > > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */ > + for (h = 0; h < SIZEOF (handlers); h++) > + { > + enum iconv_ilseq_handler handler = handlers[h]; > + static const char input[] = "\304rger mit b\366sen B\374bchen ohne > Augenma\337"; > + static const char expected[] = "\2010\2072rger mit b\2010\2132sen > B\250\271bchen ohne Augenma\2010\2118"; > + for (o = 0; o < 2; o++) > + { > + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); > + char *result = NULL; > + size_t length = 0; > + int retval = mem_iconveh (input, strlen (input), > + "ISO-8859-1", "GB18030", > + handler, > + offsets, > + &result, &length); > + ASSERT (retval == 0); > + ASSERT (length == strlen (expected)); > + ASSERT (result != NULL && memcmp (result, expected, strlen > (expected)) == 0); > + if (o) > + { > + for (i = 0; i < 37; i++) > + ASSERT (offsets[i] == (i < 1 ? i : > + i < 12 ? i + 3 : > + i < 18 ? i + 6 : > + i + 7)); > + ASSERT (offsets[37] == MAGIC); > + free (offsets); > + } > + free (result); > + } > + } > +# endif > + > /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ > for (h = 0; h < SIZEOF (handlers); h++) > { > @@ -931,6 +1297,7 @@ main () > free (offsets); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > { > static const char expected[] = "Rafa? Maszkowski"; > ASSERT (retval == 0); > @@ -1023,6 +1390,7 @@ main () > ASSERT (result == NULL && errno == EILSEQ); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > { > static const char expected[] = "Rafa? Maszkowski"; > ASSERT (result != NULL); > @@ -1053,6 +1421,20 @@ main () > free (result); > } > > +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) > + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */ > + for (h = 0; h < SIZEOF (handlers); h++) > + { > + enum iconv_ilseq_handler handler = handlers[h]; > + static const char input[] = "\304rger mit b\366sen B\374bchen ohne > Augenma\337"; > + static const char expected[] = "\2010\2072rger mit b\2010\2132sen > B\250\271bchen ohne Augenma\2010\2118"; > + char *result = str_iconveh (input, "ISO-8859-1", "GB18030", handler); > + ASSERT (result != NULL); > + ASSERT (strcmp (result, expected) == 0); > + free (result); > + } > +# endif > + > /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ > for (h = 0; h < SIZEOF (handlers); h++) > { > @@ -1077,6 +1459,7 @@ main () > ASSERT (result == NULL && errno == EILSEQ); > break; > case iconveh_question_mark: > + case iconveh_replacement_character: > { > static const char expected[] = "Costs: 27 ?"; > ASSERT (result != NULL); > > >