Search code examples
rlistdataframedplyrmutate

Add lists as variables in a dataframe


I need an efficient way to add a series of variables together, preferably without a loop.

library(dplyr) 
avar_2000  <- c(2,5,1)
avar_2001  <- c(2,3,1)
avar_2002  <- c(7,2,5)
bvar_2000  <- c(9,1,1)
bvar_2001  <- c(5,5,3)
bvar_2002  <- c(3,8,NA)


df <- data.frame(avar_2000,avar_2001,avar_2002,bvar_2000,bvar_2001,bvar_2002)

print(df)
avar_2000 avar_2001 avar_2002 bvar_2000 bvar_2001 bvar_2002
2 2 7 9 5 3
5 3 2 1 5 8
1 1 5 1 3 NA
bvars <- c ("bvar_2000","bvar_2001","bvar_2002")

df2 <- df %>% mutate (avar_sum = avar_2000 + avar_2001 + avar_2002) %>%
  rowwise() %>%
  mutate (bvar_sum = sum(across(all_of(bvars))))

print(df2)
avar_2000 avar_2001 avar_2002 bvar_2000 bvar_2001 bvar_2002 avar_sum bvar_sum
2 2 7 9 5 3 11 17
5 3 2 1 5 8 10 14
1 1 5 1 3 NA 7 NA

Two puzzles:

  1. How can I sum up all the avars and bvars within one line? My project has many of these series, and it would be neat if I could say, "add up all the avars into avars_sum, bvars into bvars_sum, etc.". Is that possible?

  2. How can I add avars and bvars together within their respective years into a new series? As in, avar2000 + bvar2000 = cvar2000. Without writing it out and without writing a loop.

I wrote a loop and it works, but it's... well, there's a better way, right?

for (b in 2000:2003) {
  avar = paste0("avar_",b)
  bvar = paste0("bvar_",b)
  cvar = paste0("cvar_",b)
  
  df2<- df2%>% 
    mutate(
      !!cvar := get(avar) + get(bvar))   
}

Solution

  • If the columns are in the same order, we could use

    library(dplyr)
    library(stringr)
    df %>% 
      mutate(across(starts_with('avar'), 
       .names = "{str_replace(.col, 'avar', 'cvar')}") + 
          across(starts_with('bvar')))
    

    -output

     avar_2000 avar_2001 avar_2002 bvar_2000 bvar_2001 bvar_2002 cvar_2000 cvar_2001 cvar_2002
    1         2         2         7         9         5         3        11         7        10
    2         5         3         2         1         5         8         6         8        10
    3         1         1         5         1         3        NA         2         4        NA
    

    As mentioned by @M--, if we want to do both the operations

    df %>% 
      mutate(avar_sum = rowSums(pick(starts_with('avar')), 
       na.rm = TRUE), bvar_sum = rowSums(pick(starts_with('bvar')), 
            na.rm = TRUE)) %>% 
       mutate(across(c(starts_with('avar'), -ends_with('sum')), 
       .names = "{str_replace(.col, 'avar', 'cvar')}") + 
          across(c(starts_with('bvar'), -ends_with('sum'))))
    

    -output

     avar_2000 avar_2001 avar_2002 bvar_2000 bvar_2001 bvar_2002 avar_sum bvar_sum cvar_2000 cvar_2001 cvar_2002
    1         2         2         7         9         5         3       11       17        11         7        10
    2         5         3         2         1         5         8       10       14         6         8        10
    3         1         1         5         1         3        NA        7        4         2         4        NA
    

    Or another option with dplyover

    library(dplyover)
    df %>%
       mutate(across2(starts_with('avar'), 
        starts_with('bvar'), ~ .x + .y, .names_fn = 
          ~ str_replace(.x, "avar_(\\d+)_.*", "cvar_\\1")))
    

    -output

    avar_2000 avar_2001 avar_2002 bvar_2000 bvar_2001 bvar_2002 cvar_2000 cvar_2001 cvar_2002
    1         2         2         7         9         5         3        11         7        10
    2         5         3         2         1         5         8         6         8        10
    3         1         1         5         1         3        NA         2         4        NA
    

    Or in base R, use split.default

    df[paste0("cvar_", 2000:2002)] <- lapply(split.default(df, 
          trimws(names(df), whitespace = ".*_")), rowSums)