Re: [striconveh] Error handling and Unicode replacement character

Marc Nieper-Wißkirchen Wed, 05 Jan 2022 02:07:44 -0800

Dear Bruno,

Thank you for responding so quickly and for this addition!


Marc

Am Sa., 1. Jan. 2022 um 19:55 Uhr schrieb Bruno Haible <br...@clisp.org>:
>
> Marc Nieper-Wißkirchen wrote on 2021-12-30:
> > The striconveh module and related modules offer an error handler
> > argument. The current possible values are:
> >
> > iconveh_error
> > iconveh_question_mark
> > iconveh_escape_sequence
> >
> > The second option replaces any unconvertible character with a question
> > mark "?".
> >
> > I would like to request the addition of a fourth option, say,
> > iconveh_replacement_character, which is like iconveh_question_mark but
> > uses U+FFFD whenever the target codeset is a Unicode codeset.
>
> That's a good suggestion, as nowadays people are frequently converting
> to UTF-8 or GB18030. Implemented as follows.
>
>
> 2022-01-01  Bruno Haible  <br...@clisp.org>
>
>         striconveh: Support an error handler that produces a Unicode U+FFFD.
>         Suggested by Marc Nieper-Wißkirchen in
>         <https://lists.gnu.org/archive/html/bug-gnulib/2021-12/msg00175.html>.
>         * lib/iconveh.h (iconveh_replacement_character): New enum value.
>         * lib/striconveh.c (mem_cd_iconveh_internal): When the handler is
>         iconveh_replacement_character, try to produce U+FFFD when possible,
>         instead of '?'.
>         * tests/test-striconveh.c (main): Add GB18030 tests. Test also
>         iconveh_replacement_character.
>
> diff --git a/lib/iconveh.h b/lib/iconveh.h
> index d321d34cb..058f68ca2 100644
> --- a/lib/iconveh.h
> +++ b/lib/iconveh.h
> @@ -29,7 +29,10 @@ enum iconv_ilseq_handler
>  {
>    iconveh_error,                /* return and set errno = EILSEQ */
>    iconveh_question_mark,        /* use one '?' per unconvertible character */
> -  iconveh_escape_sequence       /* use escape sequence \uxxxx or \Uxxxxxxxx 
> */
> +  iconveh_escape_sequence,      /* use escape sequence \uxxxx or \Uxxxxxxxx 
> */
> +  iconveh_replacement_character /* use one U+FFFD per unconvertible character
> +                                   if that fits in the target encoding,
> +                                   otherwise one '?' */
>  };
>
>
> diff --git a/lib/striconveh.c b/lib/striconveh.c
> index 4aa8a2f07..612c38c3e 100644
> --- a/lib/striconveh.c
> +++ b/lib/striconveh.c
> @@ -457,13 +457,18 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>                  if (cd2 == (iconv_t)(-1))
>                    {
>                      /* TO_CODESET is UTF-8.  */
> -                    /* Error handling can produce up to 1 byte of output.  */
> -                    if (length + 1 + extra_alloc > allocated)
> +                    /* Error handling can produce up to 1 or 3 bytes of
> +                       output.  */
> +                    size_t extra_need =
> +                      (handler == iconveh_replacement_character ? 3 : 1);
> +                    if (length + extra_need + extra_alloc > allocated)
>                        {
>                          char *memory;
>
>                          allocated = 2 * allocated;
> -                        if (length + 1 + extra_alloc > allocated)
> +                        if (length + extra_need + extra_alloc > allocated)
> +                          allocated = 2 * allocated;
> +                        if (length + extra_need + extra_alloc > allocated)
>                            abort ();
>                          if (result == initial_result)
>                            memory = (char *) malloc (allocated);
> @@ -482,7 +487,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>                          grow = false;
>                        }
>                      /* The input is invalid in FROM_CODESET.  Eat up one byte
> -                       and emit a question mark.  */
> +                       and emit a replacement character or a question mark.  
> */
>                      if (!incremented)
>                        {
>                          if (insize == 0)
> @@ -490,8 +495,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>                          inptr++;
>                          insize--;
>                        }
> -                    result[length] = '?';
> -                    length++;
> +                    if (handler == iconveh_replacement_character)
> +                      {
> +                        /* U+FFFD in UTF-8 encoding.  */
> +                        result[length+0] = '\357';
> +                        result[length+1] = '\277';
> +                        result[length+2] = '\275';
> +                        length += 3;
> +                      }
> +                    else
> +                      {
> +                        result[length] = '?';
> +                        length++;
> +                      }
>                    }
>                  else
>                    goto indirectly;
> @@ -594,7 +610,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>    {
>      const bool slowly = (offsets != NULL || handler == iconveh_error);
>  # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
> -    char utf8buf[utf8bufsize + 1];
> +    char utf8buf[utf8bufsize + 3];
>      size_t utf8len = 0;
>      const char *in1ptr = src;
>      size_t in1size = srclen;
> @@ -682,8 +698,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>              && errno == EILSEQ && handler != iconveh_error)
>            {
>              /* The input is invalid in FROM_CODESET.  Eat up one byte and
> -               emit a question mark.  Room for the question mark was 
> allocated
> -               at the end of utf8buf.  */
> +               emit a U+FFFD character or a question mark.  Room for this
> +               character was allocated at the end of utf8buf.  */
>              if (!incremented1)
>                {
>                  if (in1size == 0)
> @@ -691,7 +707,16 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>                  in1ptr++;
>                  in1size--;
>                }
> -            *out1ptr++ = '?';
> +            if (handler == iconveh_replacement_character)
> +              {
> +                /* U+FFFD in UTF-8 encoding.  */
> +                out1ptr[0] = '\357';
> +                out1ptr[1] = '\277';
> +                out1ptr[2] = '\275';
> +                out1ptr += 3;
> +              }
> +            else
> +              *out1ptr++ = '?';
>              res1 = 0;
>            }
>          errno1 = errno;
> @@ -756,7 +781,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>                        break;
>                      else if (errno == EILSEQ && handler != iconveh_error)
>                        {
> -                        /* Error handling can produce up to 10 bytes of ASCII
> +                        /* Error handling can produce up to 10 bytes of UTF-8
>                             output.  But TO_CODESET may be UCS-2, UTF-16 or
>                             UCS-4, so use CD2 here as well.  */
>                          char scratchbuf[10];
> @@ -804,6 +829,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>                              scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
>                              scratchbuf[scratchlen++] = hex[uc & 15];
>                            }
> +                        else if (handler == iconveh_replacement_character)
> +                          {
> +                            /* U+FFFD in UTF-8 encoding.  */
> +                            scratchbuf[0] = '\357';
> +                            scratchbuf[1] = '\277';
> +                            scratchbuf[2] = '\275';
> +                            scratchlen = 3;
> +                          }
>                          else
>                            {
>                              scratchbuf[0] = '?';
> @@ -813,9 +846,24 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
>                          inptr = scratchbuf;
>                          insize = scratchlen;
>                          if (cd2 != (iconv_t)(-1))
> -                          res = iconv (cd2,
> -                                       (ICONV_CONST char **) &inptr, &insize,
> -                                       &out2ptr, &out2size);
> +                          {
> +                            res = iconv (cd2,
> +                                         (ICONV_CONST char **) &inptr, 
> &insize,
> +                                         &out2ptr, &out2size);
> +                            if (handler == iconveh_replacement_character
> +                                && res == (size_t)(-1) && errno == EILSEQ)
> +                              {
> +                                 /* U+FFFD can't be converted to TO_CODESET.
> +                                    Use '?' instead.  */
> +                                scratchbuf[0] = '?';
> +                                scratchlen = 1;
> +                                inptr = scratchbuf;
> +                                insize = scratchlen;
> +                                res = iconv (cd2,
> +                                             (ICONV_CONST char **) &inptr, 
> &insize,
> +                                             &out2ptr, &out2size);
> +                              }
> +                          }
>                          else
>                            {
>                              /* TO_CODESET is UTF-8.  */
> diff --git a/tests/test-striconveh.c b/tests/test-striconveh.c
> index 438b7b087..781aa5254 100644
> --- a/tests/test-striconveh.c
> +++ b/tests/test-striconveh.c
> @@ -46,14 +46,19 @@ main ()
>  {
>  #if HAVE_ICONV
>    static enum iconv_ilseq_handler handlers[] =
> -    { iconveh_error, iconveh_question_mark, iconveh_escape_sequence };
> +    {
> +      iconveh_error,
> +      iconveh_question_mark,
> +      iconveh_replacement_character,
> +      iconveh_escape_sequence
> +    };
>    size_t indirect;
>    size_t h;
>    size_t o;
>    size_t i;
>
>    /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1,
> -     ISO-8859-2, and UTF-8.  */
> +     ISO-8859-2, UTF-8, and with libiconv or glibc also GB18030.  */
>    iconv_t cd_ascii_to_88591 = iconv_open ("ISO-8859-1", "ASCII");
>    iconv_t cd_88591_to_88592 = iconv_open ("ISO-8859-2", "ISO-8859-1");
>    iconv_t cd_88592_to_88591 = iconv_open ("ISO-8859-1", "ISO-8859-2");
> @@ -63,6 +68,12 @@ main ()
>    iconv_t cd_88592_to_utf8 = iconv_open ("UTF-8", "ISO-8859-2");
>    iconv_t cd_utf8_to_88592 = iconv_open ("ISO-8859-2", "UTF-8");
>    iconv_t cd_utf7_to_utf8 = iconv_open ("UTF-8", "UTF-7");
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  iconv_t cd_ascii_to_gb18030 = iconv_open ("GB18030", "ASCII");
> +  iconv_t cd_utf8_to_gb18030 = iconv_open ("GB18030", "UTF-8");
> +  iconv_t cd_88591_to_gb18030 = iconv_open ("GB18030", "ISO-8859-1");
> +  iconv_t cd_utf7_to_gb18030 = iconv_open ("GB18030", "UTF-7");
> +# endif
>    iconveh_t cdeh_ascii_to_88591;
>    iconveh_t cdeh_ascii_to_88591_indirectly;
>    iconveh_t cdeh_88592_to_88591;
> @@ -71,12 +82,21 @@ main ()
>    iconveh_t cdeh_88591_to_utf8;
>    iconveh_t cdeh_utf8_to_88591;
>    iconveh_t cdeh_utf7_to_utf8;
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  iconveh_t cdeh_ascii_to_gb18030;
> +  iconveh_t cdeh_88591_to_gb18030;
> +  iconveh_t cdeh_utf7_to_gb18030;
> +# endif
>
>    ASSERT (cd_ascii_to_utf8 != (iconv_t)(-1));
>    ASSERT (cd_88591_to_utf8 != (iconv_t)(-1));
>    ASSERT (cd_utf8_to_88591 != (iconv_t)(-1));
>    ASSERT (cd_88592_to_utf8 != (iconv_t)(-1));
>    ASSERT (cd_utf8_to_88592 != (iconv_t)(-1));
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  ASSERT (cd_ascii_to_gb18030 != (iconv_t)(-1));
> +  ASSERT (cd_utf8_to_gb18030 != (iconv_t)(-1));
> +# endif
>
>    cdeh_ascii_to_88591.cd = cd_ascii_to_88591;
>    cdeh_ascii_to_88591.cd1 = cd_ascii_to_utf8;
> @@ -110,6 +130,20 @@ main ()
>    cdeh_utf7_to_utf8.cd1 = cd_utf7_to_utf8;
>    cdeh_utf7_to_utf8.cd2 = (iconv_t)(-1);
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  cdeh_ascii_to_gb18030.cd = cd_ascii_to_gb18030;
> +  cdeh_ascii_to_gb18030.cd1 = cd_ascii_to_utf8;
> +  cdeh_ascii_to_gb18030.cd2 = cd_utf8_to_gb18030;
> +
> +  cdeh_88591_to_gb18030.cd = cd_88591_to_gb18030;
> +  cdeh_88591_to_gb18030.cd1 = cd_88591_to_utf8;
> +  cdeh_88591_to_gb18030.cd2 = cd_utf8_to_gb18030;
> +
> +  cdeh_utf7_to_gb18030.cd = cd_utf7_to_gb18030;
> +  cdeh_utf7_to_gb18030.cd1 = cd_utf7_to_utf8;
> +  cdeh_utf7_to_gb18030.cd2 = cd_utf8_to_gb18030;
> +# endif
> +
>    /* ------------------------ Test mem_cd_iconveh() ------------------------ 
> */
>
>    /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors.  */
> @@ -175,6 +209,7 @@ main ()
>                      free (offsets);
>                    break;
>                  case iconveh_question_mark:
> +                case iconveh_replacement_character:
>                  case iconveh_escape_sequence:
>                    {
>                      static const char expected[] = "Rafa? Maszkowski";
> @@ -224,6 +259,7 @@ main ()
>                      free (offsets);
>                    break;
>                  case iconveh_question_mark:
> +                case iconveh_replacement_character:
>                    {
>                      static const char expected[] = "Rafa? Maszkowski";
>                      ASSERT (retval == 0);
> @@ -294,6 +330,41 @@ main ()
>          }
>      }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  /* Test conversion from ISO-8859-1 to GB18030 with no errors.  */
> +  for (h = 0; h < SIZEOF (handlers); h++)
> +    {
> +      enum iconv_ilseq_handler handler = handlers[h];
> +      static const char input[] = "\304rger mit b\366sen B\374bchen ohne 
> Augenma\337";
> +      static const char expected[] = "\2010\2072rger mit b\2010\2132sen 
> B\250\271bchen ohne Augenma\2010\2118";
> +      for (o = 0; o < 2; o++)
> +        {
> +          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
> +          char *result = NULL;
> +          size_t length = 0;
> +          int retval = mem_cd_iconveh (input, strlen (input),
> +                                       &cdeh_88591_to_gb18030,
> +                                       handler,
> +                                       offsets,
> +                                       &result, &length);
> +          ASSERT (retval == 0);
> +          ASSERT (length == strlen (expected));
> +          ASSERT (result != NULL && memcmp (result, expected, strlen 
> (expected)) == 0);
> +          if (o)
> +            {
> +              for (i = 0; i < 37; i++)
> +                ASSERT (offsets[i] == (i < 1 ? i :
> +                                       i < 12 ? i + 3 :
> +                                       i < 18 ? i + 6 :
> +                                       i + 7));
> +              ASSERT (offsets[37] == MAGIC);
> +              free (offsets);
> +            }
> +          free (result);
> +        }
> +    }
> +# endif
> +
>    /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
>    for (h = 0; h < SIZEOF (handlers); h++)
>      {
> @@ -371,10 +442,88 @@ main ()
>                  free (result);
>                }
>                break;
> +            case iconveh_replacement_character:
> +              {
> +                static const char expected[] = "Rafa\357\277\275 Maszkowski";
> +                ASSERT (retval == 0);
> +                ASSERT (length == strlen (expected));
> +                ASSERT (result != NULL && memcmp (result, expected, strlen 
> (expected)) == 0);
> +                if (o)
> +                  {
> +                    for (i = 0; i < 16; i++)
> +                      ASSERT (offsets[i] == (i < 5 ? i : i + 2));
> +                    ASSERT (offsets[16] == MAGIC);
> +                    free (offsets);
> +                  }
> +                free (result);
> +              }
> +              break;
>              }
>          }
>      }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ).  */
> +  for (h = 0; h < SIZEOF (handlers); h++)
> +    {
> +      enum iconv_ilseq_handler handler = handlers[h];
> +      static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski 
> */
> +      for (o = 0; o < 2; o++)
> +        {
> +          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
> +          char *result = NULL;
> +          size_t length = 0;
> +          int retval = mem_cd_iconveh (input, strlen (input),
> +                                       &cdeh_ascii_to_gb18030,
> +                                       handler,
> +                                       offsets,
> +                                       &result, &length);
> +          switch (handler)
> +            {
> +            case iconveh_error:
> +              ASSERT (retval == -1 && errno == EILSEQ);
> +              ASSERT (result == NULL);
> +              if (o)
> +                free (offsets);
> +              break;
> +            case iconveh_question_mark:
> +            case iconveh_escape_sequence:
> +              {
> +                static const char expected[] = "Rafa? Maszkowski";
> +                ASSERT (retval == 0);
> +                ASSERT (length == strlen (expected));
> +                ASSERT (result != NULL && memcmp (result, expected, strlen 
> (expected)) == 0);
> +                if (o)
> +                  {
> +                    for (i = 0; i < 16; i++)
> +                      ASSERT (offsets[i] == i);
> +                    ASSERT (offsets[16] == MAGIC);
> +                    free (offsets);
> +                  }
> +                free (result);
> +              }
> +              break;
> +            case iconveh_replacement_character:
> +              {
> +                static const char expected[] = "Rafa\2041\2447 Maszkowski";
> +                ASSERT (retval == 0);
> +                ASSERT (length == strlen (expected));
> +                ASSERT (result != NULL && memcmp (result, expected, strlen 
> (expected)) == 0);
> +                if (o)
> +                  {
> +                    for (i = 0; i < 16; i++)
> +                      ASSERT (offsets[i] == (i < 5 ? i : i + 3));
> +                    ASSERT (offsets[16] == MAGIC);
> +                    free (offsets);
> +                  }
> +                free (result);
> +              }
> +              break;
> +            }
> +        }
> +    }
> +# endif
> +
>    /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ.  */
>    for (h = 0; h < SIZEOF (handlers); h++)
>      {
> @@ -399,6 +548,7 @@ main ()
>                  free (offsets);
>                break;
>              case iconveh_question_mark:
> +            case iconveh_replacement_character:
>                {
>                  static const char expected[] = "Rafa? Maszkowski";
>                  ASSERT (retval == 0);
> @@ -496,6 +646,34 @@ main ()
>            free (result);
>          }
>
> +#  if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +      /* Test conversion from UTF-7 to GB18030 with EINVAL.  */
> +      for (h = 0; h < SIZEOF (handlers); h++)
> +        {
> +          enum iconv_ilseq_handler handler = handlers[h];
> +          /* This is base64 encoded 0x54 0x32 0xD8 0x3F 0xD8 0x40.  It would
> +             convert to U+5432 U+D83F U+D840 but these are Unicode 
> surrogates.  */
> +          static const char input[] = "+VDLYP9hA";
> +          static const char expected1[] = "\337\305"; /* 吲 glibc */
> +          static const char expected2[] = ""; /* libiconv */
> +          char *result = NULL;
> +          size_t length = 0;
> +          int retval = mem_cd_iconveh (input, 7,
> +                                       &cdeh_utf7_to_gb18030,
> +                                       handler,
> +                                       NULL,
> +                                       &result, &length);
> +          ASSERT (retval == 0);
> +          ASSERT (length == strlen (expected1) || length == strlen 
> (expected2));
> +          ASSERT (result != NULL);
> +          if (length == strlen (expected1))
> +            ASSERT (memcmp (result, expected1, strlen (expected1)) == 0);
> +          else
> +            ASSERT (memcmp (result, expected2, strlen (expected2)) == 0);
> +          free (result);
> +        }
> +#  endif
> +
>        /* Disabled on NetBSD, because NetBSD 5.0 iconv() is buggy: it converts
>           the input "+2D/YQNhB" to U+1FED8 U+3FD8 U+40D8.  */
>  #  if !(defined __NetBSD__ && !defined _LIBICONV_VERSION)
> @@ -544,8 +722,98 @@ main ()
>                  free (result);
>                }
>                break;
> +            case iconveh_replacement_character:
> +              {
> +                /* glibc result */
> +                static const char expected1[] = 
> "\357\277\275\357\277\275\357\277\275\357\277\275\357\277\275";
> +                /* libiconv <= 1.12 result */
> +                static const char expected2[] = "\357\277\2752D/YQNhB";
> +                /* libiconv >= 1.13 result */
> +                static const char expected3[] = 
> "\357\277\275\340\277\266\341\200\266";
> +                ASSERT (retval == 0);
> +                ASSERT (length == strlen (expected1)
> +                        || length == strlen (expected2)
> +                        || length == strlen (expected3));
> +                ASSERT (result != NULL);
> +                if (length == strlen (expected1))
> +                  ASSERT (memcmp (result, expected1, strlen (expected1)) == 
> 0);
> +                else if (length == strlen (expected2))
> +                  ASSERT (memcmp (result, expected2, strlen (expected2)) == 
> 0);
> +                else
> +                  ASSERT (memcmp (result, expected3, strlen (expected3)) == 
> 0);
> +                free (result);
> +              }
> +            }
> +        }
> +
> +#   if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined 
> __UCLIBC__)
> +      /* Test conversion from UTF-7 to GB18030 with EILSEQ.  */
> +      for (h = 0; h < SIZEOF (handlers); h++)
> +        {
> +          enum iconv_ilseq_handler handler = handlers[h];
> +          /* This is base64 encoded 0xD8 0x3F 0xD8 0x40 0xD8 0x41.  It would
> +             convert to U+D83F U+D840 U+D841 but these are Unicode 
> surrogates.  */
> +          static const char input[] = "+2D/YQNhB";
> +          char *result = NULL;
> +          size_t length = 0;
> +          int retval = mem_cd_iconveh (input, strlen (input),
> +                                       &cdeh_utf7_to_gb18030,
> +                                       handler,
> +                                       NULL,
> +                                       &result, &length);
> +          switch (handler)
> +            {
> +            case iconveh_error:
> +              ASSERT (retval == -1 && errno == EILSEQ);
> +              ASSERT (result == NULL);
> +              break;
> +            case iconveh_question_mark:
> +            case iconveh_escape_sequence:
> +              {
> +                /* glibc result */
> +                static const char expected1[] = "?????";
> +                /* libiconv <= 1.12 result */
> +                static const char expected2[] = "?2D/YQNhB";
> +                /* libiconv behaviour changed in version 1.13: the result is
> +                   '?' U+0FF6 U+1036; this is U+D83F U+D840 U+D841 shifted 
> left
> +                   by 6 bits.  */
> +                static const char expected3[] = "?\2013\2030\2013\2114";
> +                ASSERT (retval == 0);
> +                ASSERT (length == strlen (expected1)
> +                        || length == strlen (expected2)
> +                        || length == strlen (expected3));
> +                ASSERT (result != NULL);
> +                if (length == strlen (expected1))
> +                  ASSERT (memcmp (result, expected1, strlen (expected1)) == 
> 0);
> +                else if (length == strlen (expected2))
> +                  ASSERT (memcmp (result, expected2, strlen (expected2)) == 0
> +                          || memcmp (result, expected3, strlen (expected3)) 
> == 0);
> +                free (result);
> +              }
> +              break;
> +            case iconveh_replacement_character:
> +              {
> +                /* glibc result */
> +                static const char expected1[] = 
> "\2041\2447\2041\2447\2041\2447\2041\2447\2041\2447";
> +                /* libiconv <= 1.12 result */
> +                static const char expected2[] = "\2041\24472D/YQNhB";
> +                /* libiconv >= 1.13 result */
> +                static const char expected3[] = 
> "\2041\2447\2013\2030\2013\2114";
> +                ASSERT (retval == 0);
> +                ASSERT (length == strlen (expected1)
> +                        || length == strlen (expected2)
> +                        || length == strlen (expected3));
> +                ASSERT (result != NULL);
> +                if (length == strlen (expected1))
> +                  ASSERT (memcmp (result, expected1, strlen (expected1)) == 
> 0);
> +                else if (length == strlen (expected2))
> +                  ASSERT (memcmp (result, expected2, strlen (expected2)) == 0
> +                          || memcmp (result, expected3, strlen (expected3)) 
> == 0);
> +                free (result);
> +              }
>              }
>          }
> +#   endif
>  #  endif
>  # endif
>      }
> @@ -589,6 +857,7 @@ main ()
>                ASSERT (result == NULL && errno == EILSEQ);
>                break;
>              case iconveh_question_mark:
> +            case iconveh_replacement_character:
>              case iconveh_escape_sequence:
>                {
>                  static const char expected[] = "Rafa? Maszkowski";
> @@ -619,6 +888,7 @@ main ()
>                ASSERT (result == NULL && errno == EILSEQ);
>                break;
>              case iconveh_question_mark:
> +            case iconveh_replacement_character:
>                {
>                  static const char expected[] = "Rafa? Maszkowski";
>                  ASSERT (result != NULL);
> @@ -652,6 +922,22 @@ main ()
>        free (result);
>      }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  /* Test conversion from ISO-8859-1 to GB18030 with no errors.  */
> +  for (h = 0; h < SIZEOF (handlers); h++)
> +    {
> +      enum iconv_ilseq_handler handler = handlers[h];
> +      static const char input[] = "\304rger mit b\366sen B\374bchen ohne 
> Augenma\337";
> +      static const char expected[] = "\2010\2072rger mit b\2010\2132sen 
> B\250\271bchen ohne Augenma\2010\2118";
> +      char *result = str_cd_iconveh (input,
> +                                     &cdeh_88591_to_gb18030,
> +                                     handler);
> +      ASSERT (result != NULL);
> +      ASSERT (strcmp (result, expected) == 0);
> +      free (result);
> +    }
> +# endif
> +
>    /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
>    for (h = 0; h < SIZEOF (handlers); h++)
>      {
> @@ -688,8 +974,51 @@ main ()
>              free (result);
>            }
>            break;
> +        case iconveh_replacement_character:
> +          {
> +            static const char expected[] = "Rafa\357\277\275 Maszkowski";
> +            ASSERT (result != NULL);
> +            ASSERT (strcmp (result, expected) == 0);
> +            free (result);
> +          }
> +          break;
> +        }
> +    }
> +
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ).  */
> +  for (h = 0; h < SIZEOF (handlers); h++)
> +    {
> +      enum iconv_ilseq_handler handler = handlers[h];
> +      static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski 
> */
> +      char *result = str_cd_iconveh (input,
> +                                     &cdeh_ascii_to_gb18030,
> +                                     handler);
> +      switch (handler)
> +        {
> +        case iconveh_error:
> +          ASSERT (result == NULL && errno == EILSEQ);
> +          break;
> +        case iconveh_question_mark:
> +        case iconveh_escape_sequence:
> +          {
> +            static const char expected[] = "Rafa? Maszkowski";
> +            ASSERT (result != NULL);
> +            ASSERT (strcmp (result, expected) == 0);
> +            free (result);
> +          }
> +          break;
> +        case iconveh_replacement_character:
> +          {
> +            static const char expected[] = "Rafa\2041\2447 Maszkowski";
> +            ASSERT (result != NULL);
> +            ASSERT (strcmp (result, expected) == 0);
> +            free (result);
> +          }
> +          break;
>          }
>      }
> +# endif
>
>    /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ.  */
>    for (h = 0; h < SIZEOF (handlers); h++)
> @@ -705,6 +1034,7 @@ main ()
>            ASSERT (result == NULL && errno == EILSEQ);
>            break;
>          case iconveh_question_mark:
> +        case iconveh_replacement_character:
>            {
>              static const char expected[] = "Costs: 27 ?";
>              ASSERT (result != NULL);
> @@ -801,6 +1131,7 @@ main ()
>                  free (offsets);
>                break;
>              case iconveh_question_mark:
> +            case iconveh_replacement_character:
>                {
>                  static const char expected[] = "Rafa? Maszkowski";
>                  ASSERT (retval == 0);
> @@ -870,6 +1201,41 @@ main ()
>          }
>      }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  /* Test conversion from ISO-8859-1 to GB18030 with no errors.  */
> +  for (h = 0; h < SIZEOF (handlers); h++)
> +    {
> +      enum iconv_ilseq_handler handler = handlers[h];
> +      static const char input[] = "\304rger mit b\366sen B\374bchen ohne 
> Augenma\337";
> +      static const char expected[] = "\2010\2072rger mit b\2010\2132sen 
> B\250\271bchen ohne Augenma\2010\2118";
> +      for (o = 0; o < 2; o++)
> +        {
> +          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
> +          char *result = NULL;
> +          size_t length = 0;
> +          int retval = mem_iconveh (input, strlen (input),
> +                                    "ISO-8859-1", "GB18030",
> +                                    handler,
> +                                    offsets,
> +                                    &result, &length);
> +          ASSERT (retval == 0);
> +          ASSERT (length == strlen (expected));
> +          ASSERT (result != NULL && memcmp (result, expected, strlen 
> (expected)) == 0);
> +          if (o)
> +            {
> +              for (i = 0; i < 37; i++)
> +                ASSERT (offsets[i] == (i < 1 ? i :
> +                                       i < 12 ? i + 3 :
> +                                       i < 18 ? i + 6 :
> +                                       i + 7));
> +              ASSERT (offsets[37] == MAGIC);
> +              free (offsets);
> +            }
> +          free (result);
> +        }
> +    }
> +# endif
> +
>    /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
>    for (h = 0; h < SIZEOF (handlers); h++)
>      {
> @@ -931,6 +1297,7 @@ main ()
>                  free (offsets);
>                break;
>              case iconveh_question_mark:
> +            case iconveh_replacement_character:
>                {
>                  static const char expected[] = "Rafa? Maszkowski";
>                  ASSERT (retval == 0);
> @@ -1023,6 +1390,7 @@ main ()
>            ASSERT (result == NULL && errno == EILSEQ);
>            break;
>          case iconveh_question_mark:
> +        case iconveh_replacement_character:
>            {
>              static const char expected[] = "Rafa? Maszkowski";
>              ASSERT (result != NULL);
> @@ -1053,6 +1421,20 @@ main ()
>        free (result);
>      }
>
> +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__)
> +  /* Test conversion from ISO-8859-1 to GB18030 with no errors.  */
> +  for (h = 0; h < SIZEOF (handlers); h++)
> +    {
> +      enum iconv_ilseq_handler handler = handlers[h];
> +      static const char input[] = "\304rger mit b\366sen B\374bchen ohne 
> Augenma\337";
> +      static const char expected[] = "\2010\2072rger mit b\2010\2132sen 
> B\250\271bchen ohne Augenma\2010\2118";
> +      char *result = str_iconveh (input, "ISO-8859-1", "GB18030", handler);
> +      ASSERT (result != NULL);
> +      ASSERT (strcmp (result, expected) == 0);
> +      free (result);
> +    }
> +# endif
> +
>    /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
>    for (h = 0; h < SIZEOF (handlers); h++)
>      {
> @@ -1077,6 +1459,7 @@ main ()
>            ASSERT (result == NULL && errno == EILSEQ);
>            break;
>          case iconveh_question_mark:
> +        case iconveh_replacement_character:
>            {
>              static const char expected[] = "Costs: 27 ?";
>              ASSERT (result != NULL);
>
>
>

Re: [striconveh] Error handling and Unicode replacement character

Reply via email to