Search code examples
rdplyrapplysapply

Wide format: a function to calculate row means for specific batches of columns, then scale up for multiple batches


This is a followup question to a previous post of mine about building a function for calculating row means.

I want to use any function of the apply family to iterate over my dataset and each time compute the row mean (which is what the function does) for a group of columns I specify. Unfortunately, I miss something critical in the way I should tweak apply(), because I get an error that I can't troubleshoot.

Example Data

capital_cities_df <-
 data.frame("europe_paris" = 1:10, 
           "europe_london" = 11:20, 
           "europe_rome" = 21:30,
           "asia_bangkok" = 31:40,
           "asia_tokyo" = 41:50,
           "asia_kathmandu" = 51:60)

set.seed(123)
capital_cities_df <- as.data.frame(lapply(capital_cities_df, 
function(cc) cc[ sample(c(TRUE, NA),
                         prob = c(0.70, 0.30),
                         size = length(cc), 
                         replace = TRUE) ]))

> capital_cities_df

   europe_paris europe_london europe_rome asia_bangkok asia_tokyo asia_kathmandu
1             1            NA          NA           NA         41             NA
2            NA            12          22           NA         42             52
3             3            NA          23           33         43             NA
4            NA            14          NA           NA         NA             NA
5            NA            15          25           35         45             NA
6             6            NA          NA           36         NA             56
7            NA            17          NA           NA         NA             57
8            NA            18          NA           38         48             NA
9            NA            19          NA           39         49             NA
10           10            NA          30           40         NA             60

Custom Function

library(dplyr)
library(rlang)

continent_mean <- function(df, continent)  {
  df %>%
    select(starts_with(continent)) %>%
    dplyr::mutate(!!quo_name(continent) := rowMeans(., na.rm = TRUE))
}

## works for a single case:
continent_mean(capital_cities_df, "europe")

   europe_paris europe_london europe_rome europe
1             1            NA          21     11
2             2            12          22     12
3             3            NA          23     13
4             4            14          NA      9
5            NA            15          25     20
6             6            16          26     16
7            NA            17          NA     17
8            NA            18          NA     18
9            NA            19          NA     19
10           10            20          30     20

Trying to apply the function over the data, unsuccessfully

apply(
  capital_cities_df,
  MARGIN = 2,
  FUN = continent_mean(capital_cities_df, continent = "europe")
)

Error in match.fun(FUN) : 
  'continent_mean(capital_cities_df, continent = "europe")' is not a function, character or symbol

Any other combination of the arguments in apply() didn't work either, nor did sapply. This unsuccessful attempt of using apply is only for one type of columns I wish to get the mean for ("europe"). However, my ultimate goal is to be able to pass c("europe", "asia", etc.) with apply, so I could get the custom function to create row means columns for all groups of columns I specify, in one hit.

What is wrong with my code?

Thanks!

EDIT 19-AUG-2019

I was trying the solution suggested by A. Suliman (see below). It did work for the example data I posted here, but not when trying to scale it up to my real dataset, where I need to subset additional columns (rather than the "continent" batch only). More specifically, in my real data I have an ID column which I want to get outputted along the other data, when I apply my custom-made function.

Example data including "ID" column

capital_cities_df <- data.frame(
    "europe_paris" = 1:10,
    "europe_london" = 11:20,
    "europe_rome" = 21:30,
    "asia_bangkok" = 31:40,
    "asia_tokyo" = 41:50,
    "asia_kathmandu" = 51:60)
  
set.seed(123)
capital_cities_df <- as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA),
                                                 prob = c(0.70, 0.30),
                                                 size = length(cc), 
                                                 replace = TRUE) ]))

id <- 1:10
capital_cities_df <- cbind(id, capital_cities_df)

> capital_cities_df

   id europe_paris europe_london europe_rome asia_bangkok asia_tokyo asia_kathmandu
1   1            1            NA          NA           NA         41             NA
2   2           NA            12          22           NA         42             52
3   3            3            NA          23           33         43             NA
4   4           NA            14          NA           NA         NA             NA
5   5           NA            15          25           35         45             NA
6   6            6            NA          NA           36         NA             56
7   7           NA            17          NA           NA         NA             57
8   8           NA            18          NA           38         48             NA
9   9           NA            19          NA           39         49             NA
10 10           10            NA          30           40         NA             60

My function (edited to select id as well)

continent_mean <- function(df, continent)  {
  df %>%
    select(., id, starts_with(continent)) %>%
    dplyr::mutate(!!quo_name(continent) := rowMeans(., na.rm = TRUE))
}

> continent_mean(capital_cities_df, "europe") ## works in a single run

   id europe_paris europe_london europe_rome    europe
1   1            1            NA          NA  1.000000
2   2           NA            12          22 12.000000
3   3            3            NA          23  9.666667
4   4           NA            14          NA  9.000000
5   5           NA            15          25 15.000000
6   6            6            NA          NA  6.000000
7   7           NA            17          NA 12.000000
8   8           NA            18          NA 13.000000
9   9           NA            19          NA 14.000000
10 10           10            NA          30 16.666667

Trying to apply the function beyond the single use (based on A. Suliman's method) -- unsuccessfully

continents <- c("europe", "asia") 
lst <- lapply(continents, function(x) continent_mean(df=capital_cities_df[, grep(x, names(capital_cities_df))], continent=x))

## or:
purrr::map_dfc(continents, ~continent_mean(df=capital_cities_df[, grep(.x, names(capital_cities_df))], continent=.x))

In either case I get a variety of error messages:

Error in inds_combine(.vars, ind_list) : Position must be between 0 and n

At other times:

Error: invalid column index : NA for variable: 'NA' = 'NA'

All I wanted was a simple function to let me calculate row means per specification of which columns to run over, but this gets nasty for some reason. Even though I'm eager to figure out what's wrong with my code, if anybody has a better overarching solution for the entire process I'd be thankful too.

Thanks!


Solution

  • Use lapply to loop through continents then use grep to select columns with the current continent

    continents <- c("europe", "asia") 
    lst <- lapply(continents, function(x) continent_mean(df=capital_cities_df[, grep(x, names(capital_cities_df))], continent=x))
    #To a dataframe not a list
    do.call(cbind, lst)
    

    Using map_dfc from purrr we can get the result in one step

    purrr::map_dfc(continents, ~continent_mean(df=capital_cities_df[, grep(.x, names(capital_cities_df))], continent=.x))
    

    Update:

    #grep will return column positions when they match with "europe" or "asia", e.g
    > grep("europe", names(capital_cities_df))
    [1] 2 3 4
    #If we need the column names then we add value=TRUE to grep 
    > grep("europe", names(capital_cities_df), value = TRUE)
    [1] "europe_paris"  "europe_london" "europe_rome" 
    

    So to add a new column we can just use the c() function and call the function as usual

    #NOTE: Here I'm using the old function without select
    lst <- lapply(continents, function(x) continent_mean(df=capital_cities_df[, c('id',grep(x, names(capital_cities_df), value = TRUE))], continent=x))
    do.call(cbind, lst)
    id europe_paris europe_london europe_rome   europe id asia_bangkok asia_tokyo asia_kathmandu     asia
    1   1            1            NA          NA  1.00000  1           NA         41             51 31.00000
    2   2           NA            12          22 12.00000  2           NA         42             52 32.00000
    3   3            3            13          23 10.50000  3           33         43             NA 26.33333
    4   4           NA            14          NA  9.00000  4           NA         44             54 34.00000
    5   5           NA            15          25 15.00000  5           35         45             55 35.00000
    6   6            6            NA          NA  6.00000  6           36         46             56 36.00000
    7   7            7            17          27 14.50000  7           NA         47             57 37.00000
    8   8           NA            18          28 18.00000  8           38         48             NA 31.33333
    9   9            9            19          29 16.50000  9           39         49             NA 32.33333
    10 10           10            NA          30 16.66667 10           40         NA             60 36.66667
    
    #We have one problem, id column gets duplicated, map_dfc with select will solve this issue
    purrr::map_dfc(continents, ~continent_mean(df=capital_cities_df[, c('id',grep(.x, names(capital_cities_df), value = TRUE))], continent=.x)) %>%
    #Don't select any column name ends with id followed by one digit
    select(-matches('id\\d')) 
    

    If you'd like to use the new function with select then just pass capital_cities_df without grep, e.g using map_dfc

    purrr::map_dfc(continents, ~continent_mean(df=capital_cities_df, continent=.x)) %>% 
    select(-matches('id\\d'))
    

    Correction: in continent_mean

    continent_mean <- function(df, continent)  {
      df %>%
        select(., id, starts_with(continent)) %>%
        #Exclude id from the rowMeans calculation 
        dplyr::mutate(!!quo_name(continent) := rowMeans(.[grep(continent, names(.))], na.rm = TRUE))
    }