Couple more thoughts about iconv, 

(1) The 'embedded nul' error is thrown by mkCharLenCE, after the real
conversion is complete. The converted string exists in memory, though
not in a form that R can currently represent as a STRSXP. Hence the
error when passed to mkCharLenCE.

(2) The patch I submitted needed some PROTECTion, it's fixed in the
patch below. Also, in one place I did not alter CHAR(x) where x may be a
RAWSXP, because that's the pointer type required, the alternative is
(const char*) RAW(x). Should this alternative be used?

(3) I don't see why the iconv 'sub' argument shouldn't work with raw
objects also, but I think this means the 'sub' argument should also
accept raw objects.

(4) Suppose iconv fails. If x were a STRSXP, the result is NA_STRING.
Should failure of iconv on a RAWSXP return R_NilValue?

-Matt

Index: src/library/base/R/New-Internal.R
===================================================================
--- src/library/base/R/New-Internal.R   (revision 52328)
+++ src/library/base/R/New-Internal.R   (working copy)
@@ -239,7 +239,7 @@
 
 iconv <- function(x, from = "", to = "", sub = NA, mark = TRUE)
 {
-    if(!is.character(x)) x <- as.character(x)
+    if(!is.character(x) && !is.raw(x)) x <- as.character(x)
     .Internal(iconv(x, from, to, as.character(sub), mark))
 }
 
Index: src/main/sysutils.c
===================================================================
--- src/main/sysutils.c (revision 52328)
+++ src/main/sysutils.c (working copy)
@@ -548,16 +548,17 @@
        int mark;
        const char *from, *to;
        Rboolean isLatin1 = FALSE, isUTF8 = FALSE;
+       Rboolean isRawx = (TYPEOF(x) == RAWSXP);
 
-       if(TYPEOF(x) != STRSXP)
-           error(_("'x' must be a character vector"));
+       if(TYPEOF(x) != STRSXP && !isRawx)
+           error(_("'x' must be a character vector or raw"));
        if(!isString(CADR(args)) || length(CADR(args)) != 1)
            error(_("invalid '%s' argument"), "from");
        if(!isString(CADDR(args)) || length(CADDR(args)) != 1)
            error(_("invalid '%s' argument"), "to");
        if(!isString(CADDDR(args)) || length(CADDDR(args)) != 1)
            error(_("invalid '%s' argument"), "sub");
-       if(STRING_ELT(CADDDR(args), 0) == NA_STRING) sub = NULL;
+       if(STRING_ELT(CADDDR(args), 0) == NA_STRING || isRawx) sub = NULL;
        else sub = translateChar(STRING_ELT(CADDDR(args), 0));
        mark = asLogical(CAD4R(args));
        if(mark == NA_LOGICAL)
@@ -584,7 +585,7 @@
        PROTECT(ans = duplicate(x));
        R_AllocStringBuffer(0, &cbuff);  /* 0 -> default */
        for(i = 0; i < LENGTH(x); i++) {
-           si = STRING_ELT(x, i);
+           si = isRawx ? x : STRING_ELT(x, i);
        top_of_loop:
            inbuf = CHAR(si); inb = LENGTH(si);
            outbuf = cbuff.data; outb = cbuff.bufsize - 1;
@@ -622,7 +623,7 @@
                goto next_char;
            }
 
-           if(res != -1 && inb == 0) {
+           if(res != -1 && inb == 0 && !isRawx) {
                cetype_t ienc = CE_NATIVE;
 
                nout = cbuff.bufsize - 1 - outb;
@@ -632,7 +633,13 @@
                }
                SET_STRING_ELT(ans, i, mkCharLenCE(cbuff.data, nout, ienc));
            }
-           else SET_STRING_ELT(ans, i, NA_STRING);
+           else if(!isRawx) SET_STRING_ELT(ans, i, NA_STRING);
+           else {
+               nout = cbuff.bufsize - 1 - outb;
+               UNPROTECT(1);
+               PROTECT(ans = allocVector(RAWSXP, nout));
+               memcpy(RAW(ans), cbuff.data, nout);
+           }
        }
        Riconv_close(obj);
        R_FreeStringBuffer(&cbuff);

On Sat, 2010-06-19 at 16:53 -0400, Matt Shotwell wrote:
> R community,
> 
> As you may know, R's iconv doesn't work well converting to and from
> encodings that allow embedded nulls. For example
> 
> > iconv("foo", to="UTF-16")
> Error in iconv("foo", to = "UTF-16") : 
>   embedded nul in string: '\xff\xfef\0o\0o\0'
> 
> However, I don't believe embedded nulls are at issue here, but rather
> that R's iconv doesn't accept objects of type RAWSXP. The iconv
> mechanism, after all, operates on encoded binary data, and not
> necessarily null terminated C strings. I'd like to submit a very small
> patch (12 lines w/o documentation) that allows R's iconv to operate on
> raw objects, while not interfering or affecting the behavior of iconv on
> character vectors. To keep this message terse, I've put additional
> discussion, description of what the patch does, and examples here:
> http://biostatmatt.com/archives/456
> 
> Also, here is a link to the patch file:
> http://biostatmatt.com/R/R-devel-iconv-0.0.patch
> 
> If this change is adopted, I'd be happy to submit a documentation patch
> also.
> 
> -Matt
> 
> Index: src/library/base/R/New-Internal.R
> ===================================================================
> --- src/library/base/R/New-Internal.R (revision 52328)
> +++ src/library/base/R/New-Internal.R (working copy)
> @@ -239,7 +239,7 @@
>  
>  iconv <- function(x, from = "", to = "", sub = NA, mark = TRUE)
>  {
> -    if(!is.character(x)) x <- as.character(x)
> +    if(!is.character(x) && !is.raw(x)) x <- as.character(x)
>      .Internal(iconv(x, from, to, as.character(sub), mark))
>  }
>  
> Index: src/main/sysutils.c
> ===================================================================
> --- src/main/sysutils.c       (revision 52328)
> +++ src/main/sysutils.c       (working copy)
> @@ -548,16 +548,17 @@
>       int mark;
>       const char *from, *to;
>       Rboolean isLatin1 = FALSE, isUTF8 = FALSE;
> +     Rboolean isRawx = (TYPEOF(x) == RAWSXP);
>  
> -     if(TYPEOF(x) != STRSXP)
> -         error(_("'x' must be a character vector"));
> +     if(TYPEOF(x) != STRSXP && !isRawx)
> +         error(_("'x' must be a character vector or raw"));
>       if(!isString(CADR(args)) || length(CADR(args)) != 1)
>           error(_("invalid '%s' argument"), "from");
>       if(!isString(CADDR(args)) || length(CADDR(args)) != 1)
>           error(_("invalid '%s' argument"), "to");
>       if(!isString(CADDDR(args)) || length(CADDDR(args)) != 1)
>           error(_("invalid '%s' argument"), "sub");
> -     if(STRING_ELT(CADDDR(args), 0) == NA_STRING) sub = NULL;
> +     if(STRING_ELT(CADDDR(args), 0) == NA_STRING || isRawx) sub = NULL;
>       else sub = translateChar(STRING_ELT(CADDDR(args), 0));
>       mark = asLogical(CAD4R(args));
>       if(mark == NA_LOGICAL)
> @@ -584,7 +585,7 @@
>       PROTECT(ans = duplicate(x));
>       R_AllocStringBuffer(0, &cbuff);  /* 0 -> default */
>       for(i = 0; i < LENGTH(x); i++) {
> -         si = STRING_ELT(x, i);
> +         si = isRawx ? x : STRING_ELT(x, i);
>       top_of_loop:
>           inbuf = CHAR(si); inb = LENGTH(si);
>           outbuf = cbuff.data; outb = cbuff.bufsize - 1;
> @@ -622,7 +623,7 @@
>               goto next_char;
>           }
>  
> -         if(res != -1 && inb == 0) {
> +         if(res != -1 && inb == 0 && !isRawx) {
>               cetype_t ienc = CE_NATIVE;
>  
>               nout = cbuff.bufsize - 1 - outb;
> @@ -632,7 +633,12 @@
>               }
>               SET_STRING_ELT(ans, i, mkCharLenCE(cbuff.data, nout, ienc));
>           }
> -         else SET_STRING_ELT(ans, i, NA_STRING);
> +         else if(!isRawx) SET_STRING_ELT(ans, i, NA_STRING);
> +         else {
> +             nout = cbuff.bufsize - 1 - outb;
> +             ans = allocVector(RAWSXP, nout);
> +             memcpy(RAW(ans), cbuff.data, nout);
> +         }
>       }
>       Riconv_close(obj);
>       R_FreeStringBuffer(&cbuff);
> 
-- 
Matthew S. Shotwell
Graduate Student
Division of Biostatistics and Epidemiology
Medical University of South Carolina
http://biostatmatt.com

______________________________________________
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

Reply via email to