I'm expanding the default summary() function because I need more percentiles. It seems to work fine for one variable, but if I add a dataframe containing multiple variables I get strange values whereas with the default summary() it works. Even if I replicate the default summary function completely, so without adding more percentiles, it does not work. I use this line to get the code:
getS3method('summary','default')
-
summary_adj <- function (object, ..., digits = max(3L, getOption("digits") -
3L))
{
if (is.factor(object))
return(summary.factor(object, ...))
else if (is.matrix(object))
return(summary.matrix(object, digits = digits, ...))
value <- if (is.logical(object))
c(Mode = "logical", {
tb <- table(object, exclude = NULL)
if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
tb
})
else if (is.numeric(object)) {
nas <- is.na(object)
object <- object[!nas]
qq <- stats::quantile(object)
qq <- signif(c(qq[1L:3L], mean(object), qq[4L:5L]), digits)
names(qq) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.",
"Max.")
if (any(nas))
c(qq, `NA's` = sum(nas))
else qq
}
else if (is.recursive(object) && !is.language(object) &&
(n <- length(object))) {
sumry <- array("", c(n, 3L), list(names(object), c("Length",
"Class", "Mode")))
ll <- numeric(n)
for (i in 1L:n) {
ii <- object[[i]]
ll[i] <- length(ii)
cls <- oldClass(ii)
sumry[i, 2L] <- if (length(cls))
cls[1L]
else "-none-"
sumry[i, 3L] <- mode(ii)
}
sumry[, 1L] <- format(as.integer(ll))
sumry
}
else c(Length = length(object), Class = class(object), Mode = mode(object))
class(value) <- c("summaryDefault", "table")
value
}
Example data set:
nums <- data.frame(var1=rnorm(n=20,mean=5,sd=2),var2=rnorm(n=20,mean=10,sd=4))
-
> summary(nums)
var1 var2
Min. :1.821 Min. : 5.095
1st Qu.:3.705 1st Qu.: 7.827
Median :4.930 Median :10.440
Mean :4.975 Mean :10.176
3rd Qu.:6.553 3rd Qu.:12.247
Max. :7.802 Max. :14.862
> summary_adj(nums)
Length Class Mode
var1 20 -none- numeric
var2 20 -none- numeric
But it works for 1 variable:
> summary_adj(nums$var1)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.821 3.705 4.930 4.975 6.553 7.802
SO it seems not to work on a dataframe with multiple variables.. Any help is much appreciated!
Best regards, Tim
@ Edit on request I add the code I used for the different quantiles:
summary_adj<-function (object, ..., digits = max(3L, getOption("digits") -
3L))
{
if (is.factor(object))
return(summary.factor(object, ...))
else if (is.matrix(object))
return(summary.matrix(object, digits = digits, ...))
value <- if (is.logical(object))
c(Mode = "logical", {
tb <- table(object, exclude = NULL)
if (!is.null(n <- dimnames(tb)[[1L]]) && any(iN <- is.na(n))) dimnames(tb)[[1L]][iN] <- "NA's"
tb
})
else if (is.numeric(object)) {
nas <- is.na(object)
object <- object[!nas]
#qq <- stats::quantile(object)
qq <- stats::quantile(object,c(.05,.25,.5,.75,.95,1))
qq <- signif(c(qq[1L:3L], mean(object), qq[4L:6L],NROW(object)), digits)
names(qq) <- c("5th Perc", "25th Perc", "Median","Mean", "75th Perc","95th Perc",
"Max.","obs.")
if (any(nas))
c(qq, `NA's` = sum(nas))
else qq
}
else if (is.recursive(object) && !is.language(object) &&
(n <- length(object))) {
sumry <- array("", c(n, 3L), list(names(object), c("Length",
"Class", "Mode")))
ll <- numeric(n)
for (i in 1L:n) {
ii <- object[[i]]
ll[i] <- length(ii)
cls <- oldClass(ii)
sumry[i, 2L] <- if (length(cls))
cls[1L]
else "-none-"
sumry[i, 3L] <- mode(ii)
}
sumry[, 1L] <- format(as.integer(ll))
sumry
}
else c(Length = length(object), Class = class(object), Mode = mode(object))
class(value) <- c("summaryDefault", "table")
value
}
This works for a one variable in my df:
summary_adj(nums$var1)
5th Perc 25th Perc Median Mean 75th Perc 95th Perc Max. obs.
1.984 3.705 4.930 4.975 6.553 7.491 7.802 20.000
But not for all:
> summary_adj(nums)
Length Class Mode
var1 20 -none- numeric
var2 20 -none- numeric
whereas it does with the normal summary:
> summary(nums)
var1 var2
Min. :1.821 Min. : 5.095
1st Qu.:3.705 1st Qu.: 7.827
Median :4.930 Median :10.440
Mean :4.975 Mean :10.176
3rd Qu.:6.553 3rd Qu.:12.247
Max. :7.802 Max. :14.862
You can define a new function summary_adj.data.frame
function using 'getS3method(summary.data.frame)' as a prototype. Note I change the z
assignment line with the lapply
.
Call this using summary_adj.data.frame(df)
not summary_adj(df)
. Comments welcome as to how to override the summary_adj
for data frames.
summary_adj.data.frame<- function (object, maxsum = 7L, digits = max(3L, getOption("digits") -
3L), ...)
{
ncw <- function(x) {
z <- nchar(x, type = "w")
if (any(na <- is.na(z))) {
z[na] <- nchar(encodeString(z[na]), "b")
}
z
}
z <- lapply(X = as.list(object), FUN = summary_adj, maxsum = maxsum,
digits = 12L, ...)
nv <- length(object)
nm <- names(object)
lw <- numeric(nv)
nr <- if (nv)
max(unlist(lapply(z, NROW)))
else 0
for (i in seq_len(nv)) {
sms <- z[[i]]
if (is.matrix(sms)) {
cn <- paste(nm[i], gsub("^ +", "", colnames(sms),
useBytes = TRUE), sep = ".")
tmp <- format(sms)
if (nrow(sms) < nr)
tmp <- rbind(tmp, matrix("", nr - nrow(sms),
ncol(sms)))
sms <- apply(tmp, 1L, function(x) paste(x, collapse = " "))
wid <- sapply(tmp[1L, ], nchar, type = "w")
blanks <- paste(character(max(wid)), collapse = " ")
wcn <- ncw(cn)
pad0 <- floor((wid - wcn)/2)
pad1 <- wid - wcn - pad0
cn <- paste0(substring(blanks, 1L, pad0), cn, substring(blanks,
1L, pad1))
nm[i] <- paste(cn, collapse = " ")
z[[i]] <- sms
}
else {
sms <- format(sms, digits = digits)
lbs <- format(names(sms))
sms <- paste0(lbs, ":", sms, " ")
lw[i] <- ncw(lbs[1L])
length(sms) <- nr
z[[i]] <- sms
}
}
if (nv) {
z <- unlist(z, use.names = TRUE)
dim(z) <- c(nr, nv)
if (anyNA(lw))
warning("probably wrong encoding in names(.) of column ",
paste(which(is.na(lw)), collapse = ", "))
blanks <- paste(character(max(lw, na.rm = TRUE) + 2L),
collapse = " ")
pad <- floor(lw - ncw(nm)/2)
nm <- paste0(substring(blanks, 1, pad), nm)
dimnames(z) <- list(rep.int("", nr), nm)
}
else {
z <- character()
dim(z) <- c(nr, nv)
}
attr(z, "class") <- c("table")
z
}