Re: [Rd] [.data.frame speedup

Tim Hesterberg Tue, 01 Jul 2008 18:00:58 -0700

Here is a revised version of notSorted; change argument order (to be more
like
is.unsorted) and fix blunder.


notSorted <- function(x, na.rm = FALSE, decreasing = FALSE, strict = FALSE){
  # return TRUE if x is not sorted
  # If decreasing=FALSE, check for sort in increasing order
  # If strict=TRUE, ties correspond to not being sorted
  n <- length(x)
  if(n < 2)
    return(FALSE)
  if(!is.atomic(x) || (!na.rm && any(is.na(x))))
    return(NA)
  if(na.rm && any(ii <- is.na(x))){
    x <- x[!ii]
    n <- length(x)
  }
  if(decreasing){
    ifelse1(strict,
            any(x[-1] >= x[-n]),
            any(x[-1] >  x[-n]))
  } else { # check for sort in increasing order
    ifelse1(strict,
            any(x[-1] <= x[-n]),
            any(x[-1] <  x[-n]))
  }
}


On Tue, Jul 1, 2008 at 3:23 PM, Tim Hesterberg <[EMAIL PROTECTED]>
wrote:

> There is a bug in the standard version of [.data.frame;
> it mixes up handling duplicates and NAs when subscripting rows.
>   x <- data.frame(x=1:3, y=2:4, row.names=c("a","b","NA"))
>   y <- x[c(2:3, NA),]
>   y
> It creates a data frame with duplicate rows, but won't print.
>
> In the previous message I included a version of [.data.frame;
> it fails for the same example, for a different reason.  Here
> is a fix.
>
>
> "subscript.data.frame" <-
> function (x, i, j,
>           drop = if (missing(i)) TRUE else length(cols) == 1)
> {
>   # This version of [.data.frame avoid wasting time enforcing unique
>   # row names if possible.
>
>   mdrop <- missing(drop)
>   Narg <- nargs() - (!mdrop)
>   if (Narg < 3) {
>     if (!mdrop)
>       warning("drop argument will be ignored")
>     if (missing(i))
>       return(x)
>     if (is.matrix(i))
>       return(as.matrix(x)[i])
>     y <- NextMethod("[")
>     cols <- names(y)
>     if (!is.null(cols) && any(is.na(cols)))
>       stop("undefined columns selected")
>     if (any(duplicated(cols)))
>       names(y) <- make.unique(cols)
>     return(structure(y, class = oldClass(x),
>                      row.names = .row_names_info(x, 0L)))
>   }
>   if (missing(i)) {
>     if (missing(j) && drop && length(x) == 1L)
>       return(.subset2(x, 1L))
>     y <- if (missing(j))
>       x
>     else .subset(x, j)
>     if (drop && length(y) == 1L)
>       return(.subset2(y, 1L))
>     cols <- names(y)
>     if (any(is.na(cols)))
>       stop("undefined columns selected")
>     if (any(duplicated(cols)))
>       names(y) <- make.unique(cols)
>     nrow <- .row_names_info(x, 2L)
>     if (drop && !mdrop && nrow == 1L)
>       return(structure(y, class = NULL, row.names = NULL))
>     else return(structure(y, class = oldClass(x),
>                           row.names = .row_names_info(x, 0L)))
>   }
>   xx <- x
>   cols <- names(xx)
>   x <- vector("list", length(x))
>   x <- .Call("R_copyDFattr", xx, x, PACKAGE = "base")
>   oldClass(x) <- attr(x, "row.names") <- NULL
>   # Do not want to check for duplicates if don't need to
>   noDuplicateRowNames <- (is.logical(i) ||
>                           (!is.null(attr(x, "dup.row.names"))) ||
>                           (is.numeric(i) && min(i, 0, na.rm=TRUE) < 0) ||
>                           (!anyMissing(i) && !notSorted(i, strict = TRUE)))
>
>   if (!missing(j)) {
>     x <- x[j]
>     cols <- names(x)
>     if (drop && length(x) == 1L) {
>       if (is.character(i)) {
>         rows <- attr(xx, "row.names")
>         i <- pmatch(i, rows, duplicates.ok = TRUE)
>       }
>       xj <- .subset2(.subset(xx, j), 1L)
>       return(if (length(dim(xj)) != 2L) xj[i] else xj[i,
>                          , drop = FALSE])
>     }
>     if (any(is.na(cols)))
>       stop("undefined columns selected")
>     nxx <- structure(seq_along(xx), names = names(xx))
>     sxx <- match(nxx[j], seq_along(xx))
>   }
>   else sxx <- seq_along(x)
>   rows <- NULL
>   if (is.character(i)) {
>     rows <- attr(xx, "row.names")
>     i <- pmatch(i, rows, duplicates.ok = TRUE)
>   }
>   for (j in seq_along(x)) {
>     xj <- xx[[sxx[j]]]
>     x[[j]] <- if (length(dim(xj)) != 2L)
>       xj[i]
>     else xj[i, , drop = FALSE]
>   }
>   if (drop) {
>     n <- length(x)
>     if (n == 1L)
>       return(x[[1L]])
>     if (n > 1L) {
>       xj <- x[[1L]]
>       nrow <- if (length(dim(xj)) == 2L)
>         dim(xj)[1L]
>       else length(xj)
>       drop <- !mdrop && nrow == 1L
>     }
>     else drop <- FALSE
>   }
>   if (!drop) {
>     if (is.null(rows))
>       rows <- attr(xx, "row.names")
>     rows <- rows[i]
>     if(any(is.na(rows)))
>       rows[is.na(rows)] <- "NA"
>     if(!noDuplicateRowNames && any(duplicated(rows)))
>         rows <- make.unique(as.character(rows))
>     if (any(duplicated(nm <- names(x))))
>       names(x) <- make.unique(nm)
>     if (is.null(rows))
>       rows <- attr(xx, "row.names")[i]
>     attr(x, "row.names") <- rows
>     oldClass(x) <- oldClass(xx)
>   }
>   x
> }
>
> # That requires anyMissing from the splus2R package,
> # plus notSorted (or a version of is.unsorted with argument 'strict'
> added).
>
> (first version of notSorted is omitted)
>
>
> On Tue, Jul 1, 2008 at 11:20 AM, Tim Hesterberg <[EMAIL PROTECTED]>
> wrote:
>
>> Below is a version of [.data.frame that is faster
>> for subscripting rows of large data frames; it avoids calling
>>     duplicated(rows)
>> if there is no need to check for duplicate row names, when:
>>     i is logical
>>     attr(x, "dup.row.names") is not NULL (S+ compatibility)
>>     i is numeric and negative
>>     i is strictly increasing
>>
>
>

        [[alternative HTML version deleted]]

______________________________________________
[email protected] mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

Re: [Rd] [.data.frame speedup

Reply via email to