Search code examples
rfunctioncustomizationsummarys

Edit default summary function in R gives error for multiple variables


I'm expanding the default summary() function because I need more percentiles. It seems to work fine for one variable, but if I add a dataframe containing multiple variables I get strange values whereas with the default summary() it works. Even if I replicate the default summary function completely, so without adding more percentiles, it does not work. I use this line to get the code:

getS3method('summary','default')

-

summary_adj <- function (object, ..., digits = max(3L, getOption("digits") - 
    3L)) 
{
    if (is.factor(object)) 
        return(summary.factor(object, ...))
    else if (is.matrix(object)) 
        return(summary.matrix(object, digits = digits, ...))
    value <- if (is.logical(object)) 
        c(Mode = "logical", {
            tb <- table(object, exclude = NULL)
            if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
            tb
        })
    else if (is.numeric(object)) {
        nas <- is.na(object)
        object <- object[!nas]
        qq <- stats::quantile(object)
        qq <- signif(c(qq[1L:3L], mean(object), qq[4L:5L]), digits)
        names(qq) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", 
            "Max.")
        if (any(nas)) 
            c(qq, `NA's` = sum(nas))
        else qq
    }
    else if (is.recursive(object) && !is.language(object) && 
        (n <- length(object))) {
        sumry <- array("", c(n, 3L), list(names(object), c("Length", 
            "Class", "Mode")))
        ll <- numeric(n)
        for (i in 1L:n) {
            ii <- object[[i]]
            ll[i] <- length(ii)
            cls <- oldClass(ii)
            sumry[i, 2L] <- if (length(cls)) 
                cls[1L]
            else "-none-"
            sumry[i, 3L] <- mode(ii)
        }
        sumry[, 1L] <- format(as.integer(ll))
        sumry
    }
    else c(Length = length(object), Class = class(object), Mode = mode(object))
    class(value) <- c("summaryDefault", "table")
    value
}

Example data set:

nums <- data.frame(var1=rnorm(n=20,mean=5,sd=2),var2=rnorm(n=20,mean=10,sd=4))

-

> summary(nums)
      var1            var2       
 Min.   :1.821   Min.   : 5.095  
 1st Qu.:3.705   1st Qu.: 7.827  
 Median :4.930   Median :10.440  
 Mean   :4.975   Mean   :10.176  
 3rd Qu.:6.553   3rd Qu.:12.247  
 Max.   :7.802   Max.   :14.862  
> summary_adj(nums)
     Length Class  Mode   
var1 20     -none- numeric
var2 20     -none- numeric

But it works for 1 variable:

 > summary_adj(nums$var1)
       Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1.821   3.705   4.930   4.975   6.553   7.802 

SO it seems not to work on a dataframe with multiple variables.. Any help is much appreciated!

Best regards, Tim

@ Edit on request I add the code I used for the different quantiles:

summary_adj<-function (object, ..., digits = max(3L, getOption("digits") - 
                                                3L)) 
{
  if (is.factor(object)) 
    return(summary.factor(object, ...))
  else if (is.matrix(object)) 
    return(summary.matrix(object, digits = digits, ...))
  value <- if (is.logical(object)) 
    c(Mode = "logical", {
      tb <- table(object, exclude = NULL)
      if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
      tb
    })
  else if (is.numeric(object)) {
    nas <- is.na(object)
    object <- object[!nas]
    #qq <- stats::quantile(object)
    qq <- stats::quantile(object,c(.05,.25,.5,.75,.95,1))
    qq <- signif(c(qq[1L:3L], mean(object), qq[4L:6L],NROW(object)), digits)
    names(qq) <- c("5th Perc", "25th Perc", "Median","Mean", "75th Perc","95th Perc", 
                   "Max.","obs.")
    if (any(nas)) 
      c(qq, `NA's` = sum(nas))
    else qq
  }
  else if (is.recursive(object) && !is.language(object) && 
             (n <- length(object))) {
    sumry <- array("", c(n, 3L), list(names(object), c("Length", 
                                                       "Class", "Mode")))
    ll <- numeric(n)
    for (i in 1L:n) {
      ii <- object[[i]]
      ll[i] <- length(ii)
      cls <- oldClass(ii)
      sumry[i, 2L] <- if (length(cls)) 
        cls[1L]
      else "-none-"
      sumry[i, 3L] <- mode(ii)
    }
    sumry[, 1L] <- format(as.integer(ll))
    sumry
  }
  else c(Length = length(object), Class = class(object), Mode = mode(object))
  class(value) <- c("summaryDefault", "table")
  value
}

This works for a one variable in my df:

summary_adj(nums$var1)
 5th Perc 25th Perc    Median      Mean 75th Perc 95th Perc      Max.      obs. 
    1.984     3.705     4.930     4.975     6.553     7.491     7.802    20.000 

But not for all:

> summary_adj(nums)
     Length Class  Mode   
var1 20     -none- numeric
var2 20     -none- numeric

whereas it does with the normal summary:

> summary(nums)
      var1            var2       
 Min.   :1.821   Min.   : 5.095  
 1st Qu.:3.705   1st Qu.: 7.827  
 Median :4.930   Median :10.440  
 Mean   :4.975   Mean   :10.176  
 3rd Qu.:6.553   3rd Qu.:12.247  
 Max.   :7.802   Max.   :14.862 

Solution

  • You can define a new function summary_adj.data.frame function using 'getS3method(summary.data.frame)' as a prototype. Note I change the z assignment line with the lapply.

    Call this using summary_adj.data.frame(df) not summary_adj(df). Comments welcome as to how to override the summary_adj for data frames.

    summary_adj.data.frame<- function (object, maxsum = 7L, digits = max(3L, getOption("digits") - 
                                                    3L), ...) 
    {
        ncw <- function(x) {
            z <- nchar(x, type = "w")
            if (any(na <- is.na(z))) {
                z[na] <- nchar(encodeString(z[na]), "b")
            }
            z
        }
        z <- lapply(X = as.list(object), FUN = summary_adj, maxsum = maxsum, 
                    digits = 12L, ...)
        nv <- length(object)
        nm <- names(object)
        lw <- numeric(nv)
        nr <- if (nv) 
            max(unlist(lapply(z, NROW)))
        else 0
        for (i in seq_len(nv)) {
            sms <- z[[i]]
            if (is.matrix(sms)) {
                cn <- paste(nm[i], gsub("^ +", "", colnames(sms), 
                                        useBytes = TRUE), sep = ".")
                tmp <- format(sms)
                if (nrow(sms) < nr) 
                    tmp <- rbind(tmp, matrix("", nr - nrow(sms), 
                                             ncol(sms)))
                sms <- apply(tmp, 1L, function(x) paste(x, collapse = "  "))
                wid <- sapply(tmp[1L, ], nchar, type = "w")
                blanks <- paste(character(max(wid)), collapse = " ")
                wcn <- ncw(cn)
                pad0 <- floor((wid - wcn)/2)
                pad1 <- wid - wcn - pad0
                cn <- paste0(substring(blanks, 1L, pad0), cn, substring(blanks, 
                                                                        1L, pad1))
                nm[i] <- paste(cn, collapse = "  ")
                z[[i]] <- sms
            }
            else {
                sms <- format(sms, digits = digits)
                lbs <- format(names(sms))
                sms <- paste0(lbs, ":", sms, "  ")
                lw[i] <- ncw(lbs[1L])
                length(sms) <- nr
                z[[i]] <- sms
            }
        }
        if (nv) {
            z <- unlist(z, use.names = TRUE)
            dim(z) <- c(nr, nv)
            if (anyNA(lw)) 
                warning("probably wrong encoding in names(.) of column ", 
                        paste(which(is.na(lw)), collapse = ", "))
            blanks <- paste(character(max(lw, na.rm = TRUE) + 2L), 
                            collapse = " ")
            pad <- floor(lw - ncw(nm)/2)
            nm <- paste0(substring(blanks, 1, pad), nm)
            dimnames(z) <- list(rep.int("", nr), nm)
        }
        else {
            z <- character()
            dim(z) <- c(nr, nv)
        }
        attr(z, "class") <- c("table")
        z
    }