Hi, The iconv routines make it possible to convert strings, but they lack an important piece of functionality: the ability to transport information from the original string to the converted string or back, such as word breaks, line-breaking opportunities, or "wdiff" results.
This patch makes it possible. I add an optional 'offsets' argument to the general conversion functions in the modules 'striconveh', 'striconveha'. No need to make 'striconv' more complicated - users of 'striconv' can switch to 'striconveh' very easily. 2007-01-22 Bruno Haible <[EMAIL PROTECTED]> * lib/striconveh.h (mem_cd_iconveh, mem_iconveh): Add 'offsets' argument. * lib/striconveh.c (iconv_carefully_1): New function. (mem_cd_iconveh_internal, mem_cd_iconveh, mem_iconveh): Add 'offsets' argument. (str_cd_iconveh): Update. * lib/striconveha.h (mem_iconveha): Add 'offsets' argument. * lib/striconveha.c (mem_iconveha): Add 'offsets' argument. * tests/test-striconveh.c (MAGIC): New macro. (new_offsets): New function. (main): Test call with and without offsets. *** lib/striconveh.h 21 Jan 2007 22:58:01 -0000 1.4 --- lib/striconveh.h 23 Jan 2007 01:03:24 -0000 *************** *** 47,52 **** --- 47,56 ---- (iconv_t)(-1) if FROM_CODESET is UTF-8). CD2 is the conversion descriptor from UTF-8 to TO_CODESET (or (iconv_t)(-1) if TO_CODESET is UTF-8). + If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this + array is filled with offsets into the result, i.e. the character starting + at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]], + and other offsets are set to (size_t)(-1). *RESULTP and *LENGTH should initially be a scratch buffer and its size, or *RESULTP can initially be NULL. May erase the contents of the memory at *RESULTP. *************** *** 58,63 **** --- 62,68 ---- mem_cd_iconveh (const char *src, size_t srclen, iconv_t cd, iconv_t cd1, iconv_t cd2, enum iconv_ilseq_handler handler, + size_t *offsets, char **resultp, size_t *lengthp); /* Convert an entire string from one encoding to another, using iconv. *************** *** 81,86 **** --- 86,95 ---- /* Convert an entire string from one encoding to another, using iconv. The original string is at [SRC,...,SRC+SRCLEN-1]. 
+ If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this + array is filled with offsets into the result, i.e. the character starting + at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]], + and other offsets are set to (size_t)(-1). *RESULTP and *LENGTH should initially be a scratch buffer and its size, or *RESULTP can initially be NULL. May erase the contents of the memory at *RESULTP. *************** *** 92,97 **** --- 101,107 ---- mem_iconveh (const char *src, size_t srclen, const char *from_codeset, const char *to_codeset, enum iconv_ilseq_handler handler, + size_t *offsets, char **resultp, size_t *lengthp); /* Convert an entire string from one encoding to another, using iconv. *** lib/striconveh.c 21 Jan 2007 22:59:19 -0000 1.5 --- lib/striconveh.c 23 Jan 2007 01:03:25 -0000 *************** *** 119,129 **** --- 119,186 ---- iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft)) # endif + /* iconv_carefully_1 is like iconv_carefully, except that it stops after + converting one character. */ + static size_t + iconv_carefully_1 (iconv_t cd, + const char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft, + bool *incremented) + { + const char *inptr = *inbuf; + const char *inptr_end = inptr + *inbytesleft; + char *outptr = *outbuf; + size_t outsize = *outbytesleft; + const char *inptr_before = inptr; + size_t res = (size_t)(-1); + size_t insize; + + for (insize = 1; inptr + insize <= inptr_end; insize++) + { + res = iconv (cd, + (ICONV_CONST char **) &inptr, &insize, + &outptr, &outsize); + if (!(res == (size_t)(-1) && errno == EINVAL)) + break; + /* We expect that no input bytes have been consumed so far. */ + if (inptr != inptr_before) + abort (); + } + + *inbuf = inptr; + *inbytesleft = inptr_end - inptr; + # if !defined _LIBICONV_VERSION && !defined __GLIBC__ + /* Irix iconv() inserts a NUL byte if it cannot convert. 
+ NetBSD iconv() inserts a question mark if it cannot convert. + Only GNU libiconv and GNU libc are known to prefer to fail rather + than doing a lossy conversion. */ + if (res != (size_t)(-1) && res > 0) + { + /* iconv() has already incremented INPTR. We cannot go back to a + previous INPTR, otherwise the state inside CD would become invalid, + if FROM_CODESET is a stateful encoding. So, tell the caller that + *INBUF has already been incremented. */ + *incremented = (inptr > inptr_before); + errno = EILSEQ; + return (size_t)(-1); + } + # endif + + if (res != (size_t)(-1)) + { + *outbuf = outptr; + *outbytesleft = outsize; + } + *incremented = false; + return res; + } + static int mem_cd_iconveh_internal (const char *src, size_t srclen, iconv_t cd, iconv_t cd1, iconv_t cd2, enum iconv_ilseq_handler handler, size_t extra_alloc, + size_t *offsets, char **resultp, size_t *lengthp) { /* When a conversion error occurs, we cannot start using CD1 and CD2 at *************** *** 141,146 **** --- 198,204 ---- char *result; size_t allocated; size_t length; + size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */ if (*lengthp >= sizeof (tmpbuf)) { *************** *** 153,158 **** --- 211,226 ---- allocated = sizeof (tmpbuf); } result = initial_result; + + if (offsets != NULL) + { + size_t i; + + for (i = 0; i < srclen; i++) + offsets[i] = (size_t)(-1); + + last_length = (size_t)(-1); + } length = 0; /* First, try a direct conversion, and see whether a conversion error *************** *** 176,191 **** size_t res; bool grow; ! /* Use iconv_carefully instead of iconv here, because: ! - If TO_CODESET is UTF-8, we can do the error handling in this loop, ! no need for a second loop, ! - With iconv() implementations other than GNU libiconv and GNU libc, ! if we use iconv() in a big swoop, checking for an E2BIG return, ! we lose the number of irreversible conversions. */ ! res = iconv_carefully (cd, ! &inptr, &insize, ! &outptr, &outsize, ! 
&incremented); length = outptr - result; grow = (length + extra_alloc > allocated / 2); --- 244,272 ---- size_t res; bool grow; ! if (offsets != NULL) ! { ! if (length != last_length) /* ensure that offset[] be increasing */ ! { ! offsets[inptr - src] = length; ! last_length = length; ! } ! res = iconv_carefully_1 (cd, ! &inptr, &insize, ! &outptr, &outsize, ! &incremented); ! } ! else ! /* Use iconv_carefully instead of iconv here, because: ! - If TO_CODESET is UTF-8, we can do the error handling in this ! loop, no need for a second loop, ! - With iconv() implementations other than GNU libiconv and GNU ! libc, if we use iconv() in a big swoop, checking for an E2BIG ! return, we lose the number of irreversible conversions. */ ! res = iconv_carefully (cd, ! &inptr, &insize, ! &outptr, &outsize, ! &incremented); length = outptr - result; grow = (length + extra_alloc > allocated / 2); *************** *** 332,337 **** --- 413,427 ---- /* The direct conversion failed, handler != iconveh_error, and cd2 != (iconv_t)(-1). Use a conversion through UTF-8. */ + if (offsets != NULL) + { + size_t i; + + for (i = 0; i < srclen; i++) + offsets[i] = (size_t)(-1); + + last_length = (size_t)(-1); + } length = 0; { # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */ *************** *** 362,372 **** /* Conversion step 1: from FROM_CODESET to UTF-8. */ if (in1size > 0) { if (cd1 != (iconv_t)(-1)) ! res1 = iconv_carefully (cd1, ! (ICONV_CONST char **) &in1ptr, &in1size, ! &out1ptr, &out1size, ! &incremented1); else { /* FROM_CODESET is UTF-8. */ --- 452,476 ---- /* Conversion step 1: from FROM_CODESET to UTF-8. */ if (in1size > 0) { + if (offsets != NULL + && length != last_length) /* ensure that offset[] be increasing */ + { + offsets[in1ptr - src] = length; + last_length = length; + } if (cd1 != (iconv_t)(-1)) ! { ! if (offsets != NULL) ! res1 = iconv_carefully_1 (cd1, ! &in1ptr, &in1size, ! &out1ptr, &out1size, ! &incremented1); ! else ! 
res1 = iconv_carefully (cd1, ! &in1ptr, &in1size, ! &out1ptr, &out1size, ! &incremented1); ! } else { /* FROM_CODESET is UTF-8. */ *************** *** 418,424 **** out1ptr += m; out1size -= m; } ! while (in1size > 0); } } else if (do_final_flush1) --- 522,528 ---- out1ptr += m; out1size -= m; } ! while (offsets == NULL && in1size > 0); } } else if (do_final_flush1) *************** *** 469,475 **** errno1 = errno; utf8len = out1ptr - utf8buf; ! if (in1size == 0 || utf8len > utf8bufsize / 2 || (res1 == (size_t)(-1) && errno1 == E2BIG)) { --- 573,580 ---- errno1 = errno; utf8len = out1ptr - utf8buf; ! if (offsets != NULL ! || in1size == 0 || utf8len > utf8bufsize / 2 || (res1 == (size_t)(-1) && errno1 == E2BIG)) { *************** *** 726,735 **** mem_cd_iconveh (const char *src, size_t srclen, iconv_t cd, iconv_t cd1, iconv_t cd2, enum iconv_ilseq_handler handler, char **resultp, size_t *lengthp) { return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0, ! resultp, lengthp); } char * --- 831,841 ---- mem_cd_iconveh (const char *src, size_t srclen, iconv_t cd, iconv_t cd1, iconv_t cd2, enum iconv_ilseq_handler handler, + size_t *offsets, char **resultp, size_t *lengthp) { return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0, ! offsets, resultp, lengthp); } char * *************** *** 744,750 **** char *result = NULL; size_t length = 0; int retval = mem_cd_iconveh_internal (src, strlen (src), ! cd, cd1, cd2, handler, 1, &result, &length); if (retval < 0) --- 850,856 ---- char *result = NULL; size_t length = 0; int retval = mem_cd_iconveh_internal (src, strlen (src), ! cd, cd1, cd2, handler, 1, NULL, &result, &length); if (retval < 0) *************** *** 770,775 **** --- 876,882 ---- mem_iconveh (const char *src, size_t srclen, const char *from_codeset, const char *to_codeset, enum iconv_ilseq_handler handler, + size_t *offsets, char **resultp, size_t *lengthp) { if (srclen == 0) *************** *** 778,784 **** *lengthp = 0; return 0; } ! 
else if (c_strcasecmp (from_codeset, to_codeset) == 0) { char *result; --- 885,891 ---- *lengthp = 0; return 0; } ! else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0) { char *result; *************** *** 854,861 **** result = *resultp; length = *lengthp; ! retval = ! mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, &result, &length); if (retval < 0) { --- 961,968 ---- result = *resultp; length = *lengthp; ! retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets, ! &result, &length); if (retval < 0) { *** lib/striconveha.h 21 Jan 2007 22:59:19 -0000 1.1 --- lib/striconveha.h 23 Jan 2007 01:03:25 -0000 *************** *** 30,35 **** --- 30,39 ---- /* Convert an entire string from one encoding to another, using iconv. The original string is at [SRC,...,SRC+SRCLEN-1]. The "from" encoding can also be a name defined for autodetection. + If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this + array is filled with offsets into the result, i.e. the character starting + at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]], + and other offsets are set to (size_t)(-1). *RESULTP and *LENGTH should initially be a scratch buffer and its size, or *RESULTP can initially be NULL. May erase the contents of the memory at *RESULTP. *************** *** 41,46 **** --- 45,51 ---- mem_iconveha (const char *src, size_t srclen, const char *from_codeset, const char *to_codeset, enum iconv_ilseq_handler handler, + size_t *offsets, char **resultp, size_t *lengthp); /* Convert an entire string from one encoding to another, using iconv. *** lib/striconveha.c 21 Jan 2007 22:59:19 -0000 1.1 --- lib/striconveha.c 23 Jan 2007 01:03:25 -0000 *************** *** 147,156 **** mem_iconveha (const char *src, size_t srclen, const char *from_codeset, const char *to_codeset, enum iconv_ilseq_handler handler, char **resultp, size_t *lengthp) { int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler, ! 
resultp, lengthp); if (retval >= 0 || errno != EINVAL) return retval; else --- 147,157 ---- mem_iconveha (const char *src, size_t srclen, const char *from_codeset, const char *to_codeset, enum iconv_ilseq_handler handler, + size_t *offsets, char **resultp, size_t *lengthp) { int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler, ! offsets, resultp, lengthp); if (retval >= 0 || errno != EINVAL) return retval; else *************** *** 168,174 **** { retval = mem_iconveha (src, srclen, from_codeset, to_codeset, handler, ! resultp, lengthp); if (!(retval < 0 && errno == EILSEQ)) return retval; encodings++; --- 169,175 ---- { retval = mem_iconveha (src, srclen, from_codeset, to_codeset, handler, ! offsets, resultp, lengthp); if (!(retval < 0 && errno == EILSEQ)) return retval; encodings++;