Search code examples
rfunctiongrouping

Trying to map out summary statistics of multiple variables from a written function, while splitting variables by a grouping variable


Here is the data:

df<-structure(list(Subject.ID = c(168236L, 168236L, 168236L, 168236L, 
168236L, 168236L, 168236L, 168236L, 168236L, 168695L, 168695L, 
168695L, 168695L, 168695L, 168695L, 168695L, 168695L, 168695L, 
168695L, 168695L, 168695L, 168695L, 168695L, 168695L, 168695L, 
168695L, 168695L, 168695L, 168695L, 168695L, 168695L, 168695L, 
168695L, 168695L, 168695L, 168695L, 168695L, 168695L, 168695L, 
168696L, 168696L, 168696L, 168696L, 168696L, 168696L, 168696L, 
168696L, 168696L, 168696L, 168696L, 168696L, 168696L, 168696L, 
168696L, 168696L, 168696L, 168696L, 168696L, 168696L, 168914L, 
168914L, 168914L, 168914L, 168914L, 168914L, 168914L, 168914L, 
168914L, 169242L, 169242L, 169242L, 169242L, 169242L, 169242L, 
169242L, 169242L, 169242L, 169242L, 169242L, 169242L, 169242L, 
169242L, 169242L, 169242L, 169242L, 169242L, 169242L, 169242L, 
169242L, 169242L, 169242L, 169242L, 169242L, 169242L, 169242L, 
169242L, 169242L, 169242L, 169728L, 169728L, 169728L, 169728L, 
169728L, 169728L, 169728L, 169728L, 169728L, 169728L, 169728L, 
169728L, 169728L, 169728L, 169728L, 169728L, 169728L, 169728L, 
169728L, 169728L, 170992L, 170992L, 170992L, 170992L, 170992L, 
170992L, 170992L, 170992L, 170992L, 170992L, 170992L, 170992L, 
170992L, 170992L, 170992L, 170992L, 170992L, 170992L, 170992L, 
170992L, 170992L, 170992L, 170992L, 170992L, 170992L, 170992L, 
170992L, 170992L, 170992L, 170992L, 172482L, 172482L, 172482L, 
172482L, 172482L, 172482L, 172482L, 172482L, 172482L, 172482L, 
172482L, 172482L, 172482L, 172482L, 172482L, 172482L, 172482L, 
172482L, 172482L, 172482L, 172483L, 172483L, 172483L, 172483L, 
172483L, 172483L, 172483L, 172483L, 172483L, 172483L, 172483L, 
172483L, 172483L, 172483L, 172483L, 172483L, 172483L, 172483L, 
172483L, 172483L, 172483L, 172483L, 172483L, 172483L, 172483L, 
172483L, 172483L, 172483L, 172483L, 172483L, 172490L, 172490L, 
172490L, 172490L, 172490L, 172490L, 172490L, 172490L, 172490L, 
172490L, 172490L, 172490L, 172490L, 172490L, 172490L, 172490L, 
172490L, 172490L, 172490L, 172490L, 172490L, 172490L, 172490L, 
172490L, 172490L, 172490L, 172490L, 172490L, 172490L, 172490L
), Phase = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), levels = c("Run-in", 
"Control", "TRE"), class = "factor"), FastingGlucose = c(83, 
NA, NA, NA, NA, NA, NA, NA, NA, 96, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 91, NA, NA, NA, NA, NA, NA, NA, NA, NA, 96, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, 99, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
98, NA, NA, NA, NA, NA, NA, NA, NA, NA, 95, NA, NA, NA, NA, NA, 
NA, NA, NA, 84, NA, NA, NA, NA, NA, NA, NA, NA, NA, 88, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, 93, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, 90, NA, NA, NA, NA, NA, NA, NA, NA, NA, 93, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 102, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
101, NA, NA, NA, NA, NA, NA, NA, NA, NA, 106, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 87, NA, NA, NA, NA, NA, NA, NA, NA, NA, 86, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 122, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 107, NA, NA, NA, NA, NA, NA, NA, NA, NA, 111, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 100, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 104, NA, NA, NA, NA, NA, NA, NA, NA, NA, 98, 
NA, NA, NA, NA, NA, NA, NA, NA, NA), Chol = c(NA, 137, NA, NA, 
NA, NA, NA, NA, NA, NA, 149, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, 149, NA, NA, NA, NA, NA, NA, NA, NA, NA, 140, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, 203, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, 181, NA, NA, NA, NA, NA, NA, NA, NA, NA, 274, NA, NA, NA, 
NA, NA, NA, NA, NA, 141, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
123, NA, NA, NA, NA, NA, NA, NA, NA, NA, 137, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 163, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
152, NA, NA, NA, NA, NA, NA, NA, NA, NA, 177, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 172, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
167, NA, NA, NA, NA, NA, NA, NA, NA, NA, 175, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 165, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
207, NA, NA, NA, NA, NA, NA, NA, NA, NA, 188, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 201, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
186, NA, NA, NA, NA, NA, NA, NA, NA, NA, 188, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 200, NA, NA, NA, NA, NA, NA, NA, NA), Trig = c(NA, 
NA, 68, NA, NA, NA, NA, NA, NA, NA, NA, 96, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 80, NA, NA, NA, NA, NA, NA, NA, NA, NA, 59, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, 68, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 80, NA, NA, NA, NA, NA, NA, NA, NA, NA, 228, NA, NA, 
NA, NA, NA, NA, NA, NA, 119, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, 82, NA, NA, NA, NA, NA, NA, NA, NA, NA, 79, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, 77, NA, NA, NA, NA, NA, NA, NA, NA, NA, 60, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 126, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, 83, NA, NA, NA, NA, NA, NA, NA, NA, NA, 90, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, 127, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 112, NA, NA, NA, NA, NA, NA, NA, NA, NA, 144, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, 104, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 94, NA, NA, NA, NA, NA, NA, NA, NA, NA, 247, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, 403, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, 344, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 
228L), class = "data.frame")

Here is an example of when the type of mapping I am trying to do works:

#Creating an object with the biomarkers names in it
## Find first biomarker column
bio.start <- which(colnames(df) == "FastingGlucose"); bio.start #Note in R, two "==" 
signs are used to denote equals to
## Find last biomarker column
bio.stop <- which(colnames(df) == "Trig"); bio.stop
#select all biomarker column names between the first and last biomarker
bio <- colnames(df)[bio.start:bio.stop]

df[bio] %>% split(dfsub$Phase) %>% map(summary)

It shows all 3 variables 3 times (once for each Phase- the grouping variable).

However, when I try to do something similar with a written function instead of summary, it doesn't map things the same way

Here is the function:

summarystat <- function(x, na.rm = TRUE) {
#it takes two areguments x, and na.rm
if(na.rm){ #if na.rm is false it means no need to remove NA values. Default is TRUE
x = x[!is.na(x)]
}
my_list <- list("mean" = mean(x), "median" = median(x), "sd" = sd(x))
return(my_list) 
}

I try to use it in a similar way:

df[bio] %>% split(dfsub$Phase) %>% map(summarystat)

But instead of returning 9 SETS of summary statistics (as summary does- 3 variables with 3 grouping categories- 3x3), it only returns 3 sets. No mention of the variable names. How can I alter my function to return things in a similar way


Solution

  • Your x is a dataframe. This means it actually compute statistics across all values in x. We may use lapply to avoid this:

    summarystat <- function(x, na.rm = TRUE) {
      #it takes two areguments x, and na.rm
      my_list <- lapply(as.list(x),function(v) setNames(c(mean(v,na.rm=na.rm),median(v,na.rm=na.rm),sd(v,na.rm=na.rm)),c("mean","median","sd")))
      return(my_list) 
    }
    
    df[bio] %>% split(df$Phase) %>% map(summarystat)
    $`Run-in`
    $`Run-in`$FastingGlucose
        mean   median       sd 
    97.00000 97.50000 12.40967 
    
    $`Run-in`$Chol
         mean    median        sd 
    170.37500 170.00000  27.20786 
    
    $`Run-in`$Trig
         mean    median        sd 
    118.12500 107.50000  59.21737 
    
    
    $Control
    $Control$FastingGlucose
         mean    median        sd 
    97.857143 96.000000  7.946248 
    
    $Control$Chol
         mean    median        sd 
    162.42857 165.00000  20.79148 
    
    $Control$Trig
        mean   median       sd 
    129.5714  90.0000 122.2495 
    
    
    $TRE
    $TRE$FastingGlucose
         mean    median        sd 
    96.125000 96.500000  7.827379 
    
    $TRE$Chol
         mean    median        sd 
    184.37500 178.00000  44.43916 
    
    $TRE$Trig
         mean    median        sd 
    139.75000  88.50000  96.70094
    

    Or we can modify the code in summary.data.frame:

    mysummary2 <- function (object, na.rm=TRUE,...) 
    {
      
      maxsum = 7L 
      digits = max(3L, getOption("digits") - 
                                  3L)
      
      ncw <- function(x) {
        z <- nchar(x, type = "w", allowNA = TRUE)
        if (any(na <- is.na(z))) {
          z[na] <- nchar(encodeString(z[na]), "b")
        }
        z
      }
      z <- lapply(X = as.list(object), FUN = function(v) setNames(c(mean(v,na.rm=na.rm),median(v,na.rm=na.rm),sd(v,na.rm=na.rm)),c("mean","median","sd")), ...)
      nv <- length(object)
      nm <- names(object)
      lw <- numeric(nv)
      nr <- if (nv) 
        max(vapply(z, function(x) NROW(x) + !is.null(attr(x, 
                                                          "NAs")), integer(1)))
      else 0
      for (i in seq_len(nv)) {
        sms <- z[[i]]
        if (is.matrix(sms)) {
          cn <- paste(nm[i], gsub("^ +", "", colnames(sms), 
                                  useBytes = TRUE), sep = ".")
          tmp <- format(sms)
          if (nrow(sms) < nr) 
            tmp <- rbind(tmp, matrix("", nr - nrow(sms), 
                                     ncol(sms)))
          sms <- apply(tmp, 1L, function(x) paste(x, collapse = "  "))
          wid <- sapply(tmp[1L, ], nchar, type = "w")
          blanks <- paste(character(max(wid)), collapse = " ")
          wcn <- ncw(cn)
          pad0 <- floor((wid - wcn)/2)
          pad1 <- wid - wcn - pad0
          cn <- paste0(substring(blanks, 1L, pad0), cn, substring(blanks, 
                                                                  1L, pad1))
          nm[i] <- paste(cn, collapse = "  ")
        }
        else {
          sms <- format(sms, digits = digits)
          lbs <- format(names(sms))
          sms <- paste0(lbs, ":", sms, "  ")
          lw[i] <- ncw(lbs[1L])
          length(sms) <- nr
        }
        z[[i]] <- sms
      }
      if (nv) {
        z <- unlist(z, use.names = TRUE)
        dim(z) <- c(nr, nv)
        if (anyNA(lw)) 
          warning("probably wrong encoding in names(.) of column ", 
                  paste(which(is.na(lw)), collapse = ", "))
        blanks <- paste(character(max(lw, na.rm = TRUE) + 2L), 
                        collapse = " ")
        pad <- floor(lw - ncw(nm)/2)
        nm <- paste0(substring(blanks, 1, pad), nm)
        dimnames(z) <- list(rep.int("", nr), nm)
      }
      else {
        z <- character()
        dim(z) <- c(nr, nv)
      }
      attr(z, "class") <- c("table")
      z
    }
    
    df[bio] %>% split(df$Phase) %>% map(mysummary2)
    $`Run-in`
     FastingGlucose     Chol            Trig       
     mean  :97.00   mean  :170.38   mean  :118.12  
     median:97.50   median:170.00   median:107.50  
     sd    :12.41   sd    : 27.21   sd    : 59.22  
    
    $Control
     FastingGlucose      Chol            Trig      
     mean  :97.857   mean  :162.43   mean  :129.6  
     median:96.000   median:165.00   median: 90.0  
     sd    : 7.946   sd    : 20.79   sd    :122.2  
    
    $TRE
     FastingGlucose      Chol            Trig      
     mean  :96.125   mean  :184.38   mean  :139.8  
     median:96.500   median:178.00   median: 88.5  
     sd    : 7.827   sd    : 44.44   sd    : 96.7