Search code examples
rdplyrdynamiccalculation

Basic calculations by groups in a dynamic way in R


I have a dataset like df with dozens of thousands of observations and +100 categories (in the categorical variables).

This data contains info about different individuals (id) bringing different type of patients to different places for a given price and year.

year <- c(2010, 2010, 2010, 2010, 2011, 2011, 2011, 2010, 2011)
id <- c("A", "A" , "A" , "A" , "A" , "A" , "A", "B", "B")
type <- c("kid", "kid", "adult", "kid", "kid", "dog", "cat", "kid", "kid")
place <- c("hosp", "hosp", "house", "hosp", "hosp", "hosp", "house", "hosp", "hosp")
price <- c(2, 3, 6, 5, 1, 2, 3, 4, 5)

df <- data.frame(year, id, type, place, price)

I want to do some basic calculations on df (basically summary statistics) by groups (id-year) in the following manner:

  1. create an experience variable by type of patient (assign values according to the number of years that this id has this type)
  2. create an experience variable by place (assign values according to the number of years that this id has this place)
  3. average price per visit in this given year for each id
  4. will the id appear again the following year (t+1)?: this variable can only take values 0 (no), 1 (yes).

So I can get something like df_new:

year <- c("2010", "2011", "2010", "2011")
id <- c("A", "A", "B", "B")
exp_type_kid <- c(1, 2, 1, 2)
exp_type_adult <- c(1, 1, 0, 0)
exp_type_dog <- c(0, 1, 0, 0)
exp_type_cat <- c(0, 1, 0, 0)
exp_place_hosp <- c(1, 2, 1, 2)
exp_place_house <- c(1, 2, 0, 0)
avg_price <- c(4, 2, 4, 5)
id_repeat_next_year <- c(1, 0, 1, 0)

df_new <- data.frame(year, id, exp_type_kid, exp_type_adult, exp_type_dog, exp_type_cat,
                     exp_place_hosp, exp_place_house, avg_price, id_repeat_next_year)

EDIT

My dataset can contain much more years, as in the following example:

year <- c(2010, 2010, 2010, 2010, 2011, 2011, 2011, 2009, 2010, 2015, 2017)
id <- c("A", "A" , "A" , "A" , "A" , "A" , "A", "B", "B", "B", "B")
type <- c("kid", "kid", "adult", "kid", "kid", "dog", "cat", "kid", "kid", "kid", "kid")
place <- c("hosp", "hosp", "house", "hosp", "hosp", "hosp", "house", "hosp", "hosp", "hosp", "hosp")
price <- c(2, 3, 6, 5, 1, 2, 3, 4, 4, 4, 4)

df <- data.frame(year, id, type, place, price)

Solution

  • I modified the year to make it numeric.

    year <- c(2010, 2010, 2010, 2010, 2011, 2011, 2011)
    id <- c("A", "A" , "A" , "A" , "A" , "A" , "A")
    type <- c("kid", "kid", "adult", "kid", "kid", "dog", "cat")
    place <- c("hosp", "hosp", "house", "hosp", "hosp", "hosp", "house")
    price <- c(2, 3, 6, 5, 1, 2, 3)
    
    df <- data.frame(year, id, type, place, price)
    
    df
    
    ##   year id  type place price
    ## 1 2010  A   kid  hosp     2
    ## 2 2010  A   kid  hosp     3
    ## 3 2010  A adult house     6
    ## 4 2010  A   kid  hosp     5
    ## 5 2011  A   kid  hosp     1
    ## 6 2011  A   dog  hosp     2
    ## 7 2011  A   cat house     3
    

    I see three different tasks. Points 1 and 2 are about counting, point 3 about aggregating and point 4 satisfying a condition. I think each task must be addressed individually and hopefully we can merge the results into the desired data.frame.

    Counting experience

    For a single factor we can use table() to get the counts by year:

    with(df, table(year, type))
    
    ##       type
    ## year   adult cat dog kid
    ##   2010     1   0   0   3
    ##   2011     0   1   1   1
    

    Now we only care about wether there is an ocurrence of each type for any given year:

    with(df, table(year, type) > 0)
    
    ##       type
    ## year   adult   cat   dog  kid
    ##   2010  TRUE FALSE FALSE TRUE
    ##   2011 FALSE  TRUE  TRUE TRUE
    

    From here we can cumsum() by columns to get the years of experience, and easily turn that into a data.frame:

    # type
    with(df, table(year, type) > 0) |> apply(2, cumsum) |> as.data.frame()
    
    ##      adult cat dog kid
    ## 2010     1   0   0   1
    ## 2011     1   1   1   2
    
    # place
    with(df, table(year, place) > 0) |> apply(2, cumsum) |> as.data.frame()
    
    ##      hosp house
    ## 2010    1     1
    ## 2011    2     2
    

    Let's turn this into a function to make things easier to follow:

    experience_by = function (df, what) {
      out = with(df, table(year, get(what)) > 0) |> apply(2, cumsum) |> as.data.frame()
      names(out) = paste('exp', what, names(out), sep = '_')
      return(out)
    }
    
    experience_by(df, 'type')
    
    ##      exp_type_adult exp_type_cat exp_type_dog exp_type_kid
    ## 2010              1            0            0            1
    ## 2011              1            1            1            2
    
    experience_by(df, 'place')
    
    ##      exp_place_hosp exp_place_house
    ## 2010              1               1
    ## 2011              2               2
    

    We are missing the year variable here (they are rownames now) but that will get solved in the next step.

    Aggregating prices

    This is the easy part:

    aggregate(price ~ year, FUN = mean, data = df)
    
    ##   year price
    ## 1 2010     4
    ## 2 2011     2
    

    Will id be present next year?

    with(df, unique(year + 1) %in% unique(year)) |> as.numeric()
    
    ## [1] 1 0
    

    Put it all together

    experience_by = function (df, what) {
      out = with(df, table(year, get(what)) > 0) |> apply(2, cumsum) |> as.data.frame()
      names(out) = paste('exp', what, names(out), sep = '_')
      return(out)
    }
    
    by_type = experience_by(df, 'type')
    by_place = experience_by(df, 'place')
    avg_price = aggregate(price ~ year, FUN = mean, data = df)
    id_repeat_next_year = with(df, as.numeric(unique(year + 1) %in% unique(year)))
    
    cbind(avg_price, by_type, by_place, id_repeat_next_year)
    
    ##      year price exp_type_adult exp_type_cat exp_type_dog exp_type_kid
    ## 2010 2010     4              1            0            0            1
    ## 2011 2011     2              1            1            1            2
    ##      exp_place_hosp exp_place_house id_repeat_next_year
    ## 2010              1               1                   1
    ## 2011              2               2                   0
    

    For the whole dataset this should be lapplyed over the splited data.frame. Something of this sort:

    split(df, ~ id) |>
    lapply(function (x) {
      by_type = experience_by(df, 'type')
      by_place = experience_by(df, 'place')
      avg_price = aggregate(price ~ year, FUN = mean, data = df)
      id_repeat_next_year = with(df, as.numeric(unique(year + 1) %in% unique(year)))
    
      cbind(avg_price, by_type, by_place, id_repeat_next_year)
    })
    
    ## $A
    ##      year price exp_type_adult exp_type_cat exp_type_dog exp_type_kid
    ## 2010 2010     4              1            0            0            1
    ## 2011 2011     2              1            1            1            2
    ##      exp_place_hosp exp_place_house id_repeat_next_year
    ## 2010              1               1                   1
    ## 2011              2               2                   0
    

    From here it should be easy to recover ids from the list and rbind all data.frames. I am not sure how to fill in values when there are more ids that do not share the same types or places, since this will originate differing number of columns on each data.frame...


    EDIT:

    With the new data frame and a new id:

    year <- c(2010, 2010, 2010, 2010, 2011, 2011, 2011, 2010, 2011)
    id <- c("A", "A" , "A" , "A" , "A" , "A" , "A", "B", "B")
    type <- c("kid", "kid", "adult", "kid", "kid", "dog", "cat", "kid", "kid")
    place <- c("hosp", "hosp", "house", "hosp", "hosp", "hosp", "house", "hosp", "hosp")
    price <- c(2, 3, 6, 5, 1, 2, 3, 4, 5)
    
    df <- data.frame(year, id, type, place, price)
    
    # use `local()` instead of pipes; `x` instead of `df`
    df_new = local({
      dfs = split(df, ~ id)
      dfl = lapply(dfs, function (x) {
        by_type = experience_by(x, 'type')
        by_place = experience_by(x, 'place')
        avg_price = aggregate(price ~ year, FUN = mean, data = x)
        id_repeat_next_year = with(x, as.numeric(unique(year + 1) %in% unique(year)))
      
        cbind(avg_price, by_type, by_place, id_repeat_next_year)
      })
    
      # recover `id`s
      new_id = rep(names(dfl), times = sapply(dfl, FUN = nrow))
    
      # create missing cols and combine `dfl`
      all_cols = sapply(dfl, FUN = names) |> unlist() |> unique()
      out = data.frame()
      for (i in dfl) {
        col_present = all_cols %in% names(i)
        if (!all(col_present)) {
          for (cl in all_cols[!col_present]) {
            i[[cl]] = numeric(nrow(i))
          }
        }
      out = rbind(out, i)
      }
      out = data.frame(id = new_id, out, row.names = NULL)
      out
    })
    
    df_new
    
    ##   id year price exp_type_adult exp_type_cat exp_type_dog exp_type_kid
    ## 1  A 2010     4              1            0            0            1
    ## 2  A 2011     2              1            1            1            2
    ## 3  B 2010     4              0            0            0            1
    ## 4  B 2011     5              0            0            0            2
    ##   exp_place_hosp exp_place_house id_repeat_next_year
    ## 1              1               1                   1
    ## 2              2               2                   0
    ## 3              1               0                   1
    ## 4              2               0                   0