Search code examples
rdplyrtidyversecumsum

mutate_at horizontal vs vertical


I am trying to apply a function only on certain columns using mutate_at.

Here the data:

structure(list(LoB = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3", "4"), class = "factor"), 
AY = c(1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 
2003, 2004, 2005), R_0 = c(50135, 46530, 38295, 12033, 13332, 
35064, 15695, 41227, 88360, 29500, 30158, 47589), R_1 = c(76631, 
4908, 30427, 4268, 1994, 48426, 4585, 15578, 8112, 30945, 
8141, 11594), R_2 = c(28763, 2634, 374, 0, 216, 0, 555, 0, 
7161, 2192, 0, 772), R_3 = c(0, 1409, 470, 0, 203, 0, 0, 
0, 0, 1556, 0, 675), R_4 = c(16433, 0, 436, 0, 202, 2115, 
0, 0, 0, 1271, 0, 535), R_5 = c(6301, 0, 0, 0, 179, 0, 0, 
0, 183, 1052, 0, 0), R_6 = c(0, 0, 0, 0, 147, 0, 0, 0, 0, 
982, 0, 0), R_7 = c(0, 0, 0, 0, 135, 0, 0, 0, 0, 907, 2356, 
0), R_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 902, 0, 0), R_9 = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 833, 0, 0), R_10 = c(0, 0, 0, 0, 
0, 0, 0, 0, 0, 800, 0, 0), R_11 = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 684, 0, 0)), row.names = c(NA, -12L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = "LoB", drop = TRUE, indices = list(
0:11), group_sizes = 12L, biggest_group_size = 12L, labels = structure(list(
LoB = structure(1L, .Label = c("1", "2", "3", "4"), class = "factor")), row.names = c(NA, 
-1L), class = "data.frame", vars = "LoB", drop = TRUE))

which looks like the following:

# A tibble: 12 x 14
# Groups:   LoB [1]
  LoB      AY   R_0   R_1   R_2   R_3   R_4   R_5   R_6   R_7   R_8   R_9  R_10  R_11
  <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 1      1994 50135 76631 28763     0 16433  6301     0     0     0     0     0     0
 2 1      1995 46530  4908  2634  1409     0     0     0     0     0     0     0     0
 3 1      1996 38295 30427   374   470   436     0     0     0     0     0     0     0
 4 1      1997 12033  4268     0     0     0     0     0     0     0     0     0     0
 5 1      1998 13332  1994   216   203   202   179   147   135     0     0     0     0
 6 1      1999 35064 48426     0     0  2115     0     0     0     0     0     0     0
 7 1      2000 15695  4585   555     0     0     0     0     0     0     0     0     0
 8 1      2001 41227 15578     0     0     0     0     0     0     0     0     0     0
 9 1      2002 88360  8112  7161     0     0   183     0     0     0     0     0     0
10 1      2003 29500 30945  2192  1556  1271  1052   982   907   902   833   800   684
11 1      2004 30158  8141     0     0     0     0     0  2356     0     0     0     0
12 1      2005 47589 11594   772   675   535     0     0     0     0     0     0     0

Let's say I would like to create a cumulative sum of the column which starts with R_. In order to do this I wrote:

df %>% mutate_at(vars(contains("R_")), funs(cumsum))

which gives me the following output:

# A tibble: 12 x 14
# Groups:   LoB [1]
   LoB      AY    R_0    R_1   R_2   R_3   R_4   R_5   R_6   R_7   R_8   R_9  R_10  R_11
   <fct> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 1      1994  50135  76631 28763     0 16433  6301     0     0     0     0     0     0
 2 1      1995  96665  81539 31397  1409 16433  6301     0     0     0     0     0     0
 3 1      1996 134960 111966 31771  1879 16869  6301     0     0     0     0     0     0
 4 1      1997 146993 116234 31771  1879 16869  6301     0     0     0     0     0     0
 5 1      1998 160325 118228 31987  2082 17071  6480   147   135     0     0     0     0
 6 1      1999 195389 166654 31987  2082 19186  6480   147   135     0     0     0     0
 7 1      2000 211084 171239 32542  2082 19186  6480   147   135     0     0     0     0
 8 1      2001 252311 186817 32542  2082 19186  6480   147   135     0     0     0     0
 9 1      2002 340671 194929 39703  2082 19186  6663   147   135     0     0     0     0
10 1      2003 370171 225874 41895  3638 20457  7715  1129  1042   902   833   800   684
11 1      2004 400329 234015 41895  3638 20457  7715  1129  3398   902   833   800   684
12 1      2005 447918 245609 42667  4313 20992  7715  1129  3398   902   833   800   684

The problem here is that the cumulative sum has been done vertically (by variable) rather then horizontally. How can I achieve this in dplyr?


Solution

  • Row-wise operations often work better on matrices. To avoid the gather/spread hassle, I would extract the R_ columns, use apply (implicitly converting to a matrix), and then assign the result back to the original data:

    That said, the data doesn't seem very tidy. You might be better off gathering to long format and keeping it long.

    result = dd %>% ungroup %>%
      select(starts_with("R_")) %>%
      apply(1, cumsum) %>% 
      t
    
    dd[, grepl("^R_", names(dd))] = result
    
    dd
    # # A tibble: 12 x 14
    # # Groups:   LoB [1]
    #    LoB      AY   R_0    R_1    R_2    R_3    R_4    R_5    R_6    R_7    R_8    R_9   R_10   R_11
    #    <fct> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
    #  1 1      1994 50135 126766 155529 155529 171962 178263 178263 178263 178263 178263 178263 178263
    #  2 1      1995 46530  51438  54072  55481  55481  55481  55481  55481  55481  55481  55481  55481
    #  3 1      1996 38295  68722  69096  69566  70002  70002  70002  70002  70002  70002  70002  70002
    #  4 1      1997 12033  16301  16301  16301  16301  16301  16301  16301  16301  16301  16301  16301
    #  5 1      1998 13332  15326  15542  15745  15947  16126  16273  16408  16408  16408  16408  16408
    #  6 1      1999 35064  83490  83490  83490  85605  85605  85605  85605  85605  85605  85605  85605
    #  7 1      2000 15695  20280  20835  20835  20835  20835  20835  20835  20835  20835  20835  20835
    #  8 1      2001 41227  56805  56805  56805  56805  56805  56805  56805  56805  56805  56805  56805
    #  9 1      2002 88360  96472 103633 103633 103633 103816 103816 103816 103816 103816 103816 103816
    # 10 1      2003 29500  60445  62637  64193  65464  66516  67498  68405  69307  70140  70940  71624
    # 11 1      2004 30158  38299  38299  38299  38299  38299  38299  40655  40655  40655  40655  40655
    # 12 1      2005 47589  59183  59955  60630  61165  61165  61165  61165  61165  61165  61165  61165