Search code examples
rdataframegroup-bymissing-data

mean imputation by filling in missing dates and by symetrically iterating over dates up and down to find the closest value available in r


I need to impute all missing dates between the available dates for each id's and then go symmetrically up and down to impute missing. Also, not always I need the average between two, eg: when I go 2 dates up and down and I see only 1 value, then I would impute that value.

df1 <- data.frame(id = c(11,11,11,11,11,11,11,11),
                  Date = c("2021-06-01", "2021-06-05", "2021-06-08", "2021-06-09", "2021-06-14", "2021-06-16", "2021-06-20", "2021-06-21"),
                  price = c(NA, NA,100, NA, 50, NA, 200, NA)
)

There is an excellent solution for missing imputation on a symmetrical iteration by @lovalery how to groupby and take mean of value by symetrically looping forward and backward on the date value in r

In the above solution, the date present is used, but this can be an issue when there is a large number of dates missing in between. Hence I wanted to insert all missing dates in between and then symmetrically move in both directions until I get at least 1 value in either direction, I need to retain it, if 2 values I need the mean.

enter image description here

Update: we also need to consider cases when the price is only present in the first date or in the last date. Also if same price is present in multiple dates

df1 <- data.frame(id = c(11,11,11,11,11,11,11,11,
                     12,12,12,
                     13,13,13),
              Date = c("2021-06-01", "2021-06-05", "2021-06-08", "2021-06-09", "2021-06-14", "2021-06-16", "2021-06-20", "2021-06-21",
                       "2021-07-01","2021-07-03","2021-07-05",
                       "2021-08-01","2021-08-03","2021-08-05"),
              price = c(200, NA,100, NA, 50, NA, 200, NA,
                        10,NA,NA,
                        NA,NA,20)

)

I used the function NA_imputations_dates_v2 by @lovalery

df1 <- setDT(df1)
df2 <- NA_imputations_dates_v2(df1)
df3 <- merge(df1,df2,by = c("id","Date"),all.x = T)

Solution

  • Please find below with a reprex one possible solution using the data.table and padr libraries.

    I built a function to make it easier to use.

    Reprex

    • Your dataset #1
    df1 <- data.frame(id = c(11,11,11,11,11,11,11,11),
                      Date = c("2021-06-01", "2021-06-05", "2021-06-08", "2021-06-09", "2021-06-14", "2021-06-16", "2021-06-20", "2021-06-21"),
                      price = c(NA, NA,100, NA, 50, NA, 200, NA))
    
    • Code of the NA_imputations_dates() function
    library(data.table)
    library(padr)
    
    NA_imputations_dates <- function(x) {
      
      setDT(x)[, Date := as.Date(Date)]
      
      x <- pad(x, interval = "day", group = "id")
      
      setDT(x)[, rows := .I]
      
      z <- x[, .I[!is.na(price)]]
      
      id_1 <- z[-length(z)]
      id_2 <- z[-1]
      
      values <- x[z, .(price = price, id = id)]
      values_1 <- values[-nrow(values)]
      names(values_1) <- c("price_1", "id_o1")
      values_2 <- values[-1]
      names(values_2) <- c("price_2", "id_o2")
      
      subtract <- z[-1] - z[-length(z)]
      
      r <- data.table(id_1, values_1, id_2, values_2, subtract)
      
      r <- r[, `:=` (id_mean = fifelse(subtract > 2 & subtract %% 2 == 0, id_1+(subtract/2), (id_1+id_2)/2),
                     mean = fifelse(subtract >= 2 & subtract %% 2 == 0 & id_o1 == id_o2, (price_1+price_2)/2, NA_real_))
             ][, `:=` (price_1 = NULL, id_1 = NULL, id_o1 = NULL, id_2 = NULL, price_2 = NULL, id_o2 = NULL, subtract = NULL)
               ][x, on = .(id_mean = rows)][, dummy := cumsum(!is.na(mean)), by = .(id)]
      
      h <-  r[, .(price = na.omit(price)), by = .(dummy)]
      
      Results <- r[, price := NULL
                   ][h, on = .(dummy)
                     ][, price := fifelse(!is.na(mean), mean, price)
                       ][, `:=` (id_mean = NULL, mean = NULL, dummy = NULL)][]
      
      return(Results)
    }
    
    • Output of the NA_imputations_dates() function
    NA_imputations_dates(df1)
    #>     id       Date price
    #>  1: 11 2021-06-01   100
    #>  2: 11 2021-06-02   100
    #>  3: 11 2021-06-03   100
    #>  4: 11 2021-06-04   100
    #>  5: 11 2021-06-05   100
    #>  6: 11 2021-06-06   100
    #>  7: 11 2021-06-07   100
    #>  8: 11 2021-06-08   100
    #>  9: 11 2021-06-09   100
    #> 10: 11 2021-06-10   100
    #> 11: 11 2021-06-11    75
    #> 12: 11 2021-06-12    50
    #> 13: 11 2021-06-13    50
    #> 14: 11 2021-06-14    50
    #> 15: 11 2021-06-15    50
    #> 16: 11 2021-06-16    50
    #> 17: 11 2021-06-17   125
    #> 18: 11 2021-06-18   200
    #> 19: 11 2021-06-19   200
    #> 20: 11 2021-06-20   200
    #> 21: 11 2021-06-21   200
    #>     id       Date price
    

    Created on 2021-12-12 by the reprex package (v2.0.1)


    EDIT OF THE FUNCTION TO PROCESS YOUR MORE GENERAL DATASET #2

    As a follow-up of your comment, please find below the modified version of the function (i.e. NA_imputations_dates_v2()) to deal with the more general case provided by your new dataset (i.e. dataset #2).

    Reprex

    • Your dataset #2
    df1 <- data.frame(id = c(11,11,11,11,11,11,11,11,
                             12,12,12,
                             13,13,13),
                      Date = c("2021-06-01", "2021-06-05", "2021-06-08", "2021-06-09", "2021-06-14", "2021-06-16", "2021-06-20", "2021-06-21",
                               "2021-07-01","2021-07-03","2021-07-05",
                               "2021-08-01","2021-08-03","2021-08-05"),
                      price = c(NA, NA,100, NA, 50, NA, 200, NA,
                                10,NA,NA,
                                NA,NA,20))
    
    • Code of the NA_imputations_dates_v2() function
    library(data.table)
    library(padr)  
      
    NA_imputations_dates_v2 <- function(x) {
      
      setDT(x)[, Date := as.Date(Date)]
      
      x <- pad(x, interval = "day", group = "id")
    
      setDT(x)[, rows := .I]
      
      z <- x[, .I[!is.na(price)]]
      
      id_1 <- z[-length(z)]
      id_2 <- z[-1]
      
      values <- x[z, .(price = price, id = id)]
      values_1 <- values[-nrow(values)]
      names(values_1) <- c("price_1", "id_o1")
      values_2 <- values[-1]
      names(values_2) <- c("price_2", "id_o2")
      
      subtract <- z[-1] - z[-length(z)]
      
      r <- data.table(id_1, values_1, id_2, values_2, subtract)
    
      r <- r[, `:=` (id_mean = fifelse(subtract > 2 & subtract %% 2 == 0 & id_o1 == id_o2, id_1+(subtract/2), NA_real_),
                     mean = fifelse(subtract >= 2 & subtract %% 2 == 0 & id_o1 == id_o2, (price_1+price_2)/2, NA_real_))
             ][, `:=` (price_1 = NULL, id_1 = NULL, id_o1 = NULL, id_2 = NULL, price_2 = NULL, id_o2 = NULL, subtract = NULL)
               ][x, on = .(id_mean = rows)][, dummy := cumsum(!is.na(mean)), by = .(id)]
      
      h <-  r[, .(price = na.omit(price)), by = .(dummy, id)]
      
      Results <- r[, price := NULL
                   ][h, on = .(dummy, id)
                     ][, price := fifelse(!is.na(mean), mean, price)
                       ][, `:=` (id_mean = NULL, mean = NULL, dummy = NULL)][]
      
      return(Results)
    } 
    
    • Output of the NA_imputations_dates_v2() function
    NA_imputations_dates_v2(df1)
    #>     id       Date price
    #>  1: 11 2021-06-01   100
    #>  2: 11 2021-06-02   100
    #>  3: 11 2021-06-03   100
    #>  4: 11 2021-06-04   100
    #>  5: 11 2021-06-05   100
    #>  6: 11 2021-06-06   100
    #>  7: 11 2021-06-07   100
    #>  8: 11 2021-06-08   100
    #>  9: 11 2021-06-09   100
    #> 10: 11 2021-06-10   100
    #> 11: 11 2021-06-11    75
    #> 12: 11 2021-06-12    50
    #> 13: 11 2021-06-13    50
    #> 14: 11 2021-06-14    50
    #> 15: 11 2021-06-15    50
    #> 16: 11 2021-06-16    50
    #> 17: 11 2021-06-17   125
    #> 18: 11 2021-06-18   200
    #> 19: 11 2021-06-19   200
    #> 20: 11 2021-06-20   200
    #> 21: 11 2021-06-21   200
    #> 22: 12 2021-07-01    10
    #> 23: 12 2021-07-02    10
    #> 24: 12 2021-07-03    10
    #> 25: 12 2021-07-04    10
    #> 26: 12 2021-07-05    10
    #> 27: 13 2021-08-01    20
    #> 28: 13 2021-08-02    20
    #> 29: 13 2021-08-03    20
    #> 30: 13 2021-08-04    20
    #> 31: 13 2021-08-05    20
    #>     id       Date price
    

    Created on 2021-12-14 by the reprex package (v2.0.1)


    SECOND EDIT OF THE FUNCTION TO PROCESS YOUR MORE GENERAL DATASET #3

    As a follow-up of your second comment, please find below the modified version of the function (i.e. NA_imputations_dates_v3()) to deal with the more general case provided by your new dataset (i.e. dataset #3).

    Reprex

    • Your dataset #3
    df1 <- data.frame(id = c(11,11,11,11,11,11,11,11,
                             12,12,12,
                             13,13,13),
                      Date = c("2021-06-01", "2021-06-05", "2021-06-08", "2021-06-09", "2021-06-14", "2021-06-16", "2021-06-20", "2021-06-21",
                               "2021-07-01","2021-07-03","2021-07-05",
                               "2021-08-01","2021-08-03","2021-08-05"),
                      price = c(NA, NA,100, NA, 50, NA, 200, 200,
                                10,NA,NA,
                                NA,NA,20))
    
    • Code of the NA_imputations_dates_v3() function
    library(data.table)
    library(padr)  
      
    NA_imputations_dates_v3 <- function(x) {
      
      setDT(x)[, Date := as.Date(Date)]
      
      x <- pad(x, interval = "day", group = "id")
      
      setDT(x)[, rows := .I]
      
      z <- x[, .I[!is.na(price)]]
      
      id_1 <- z[-length(z)]
      id_2 <- z[-1]
      
      values <- x[z, .(price = price, id = id)]
      values_1 <- values[-nrow(values)]
      names(values_1) <- c("price_1", "id_o1")
      values_2 <- values[-1]
      names(values_2) <- c("price_2", "id_o2")
      
      subtract <- z[-1] - z[-length(z)]
      
      r <- data.table(id_1, values_1, id_2, values_2, subtract)
      
      r <- r[, `:=` (id_mean = fifelse(subtract > 2 & subtract %% 2 == 0 & id_o1 == id_o2, id_1+(subtract/2), NA_real_),
                     mean = fifelse(subtract >= 2 & subtract %% 2 == 0 & id_o1 == id_o2, (price_1+price_2)/2, NA_real_))
             ][, `:=` (price_1 = NULL, id_1 = NULL, id_o1 = NULL, id_2 = NULL, price_2 = NULL, id_o2 = NULL, subtract = NULL)
               ][x, on = .(id_mean = rows)][, dummy := cumsum(!is.na(mean)), by = .(id)]
      
      r <- r[, price_lag := shift(price, 1), by = .(dummy, id)]
      
      h <-  r[, .(price = na.omit(price)), by = .(dummy, id, price_lag)]
      
      h <- h[h[,.I[is.na(price_lag)]]][, price_lag := NULL]
      
      Results <- r[, `:=` (price = NULL, price_lag = NULL)
                   ][h, on = .(dummy, id)
                     ][, price := fifelse(!is.na(mean), mean, price)
                       ][, `:=` (id_mean = NULL, mean = NULL, dummy = NULL)][]
      
      return(Results)
    }   
    
    • Output of the NA_imputations_dates_v3() function
    NA_imputations_dates_v3(df1)  
    #>     id       Date price
    #>  1: 11 2021-06-01   100
    #>  2: 11 2021-06-02   100
    #>  3: 11 2021-06-03   100
    #>  4: 11 2021-06-04   100
    #>  5: 11 2021-06-05   100
    #>  6: 11 2021-06-06   100
    #>  7: 11 2021-06-07   100
    #>  8: 11 2021-06-08   100
    #>  9: 11 2021-06-09   100
    #> 10: 11 2021-06-10   100
    #> 11: 11 2021-06-11    75
    #> 12: 11 2021-06-12    50
    #> 13: 11 2021-06-13    50
    #> 14: 11 2021-06-14    50
    #> 15: 11 2021-06-15    50
    #> 16: 11 2021-06-16    50
    #> 17: 11 2021-06-17   125
    #> 18: 11 2021-06-18   200
    #> 19: 11 2021-06-19   200
    #> 20: 11 2021-06-20   200
    #> 21: 11 2021-06-21   200
    #> 22: 12 2021-07-01    10
    #> 23: 12 2021-07-02    10
    #> 24: 12 2021-07-03    10
    #> 25: 12 2021-07-04    10
    #> 26: 12 2021-07-05    10
    #> 27: 13 2021-08-01    20
    #> 28: 13 2021-08-02    20
    #> 29: 13 2021-08-03    20
    #> 30: 13 2021-08-04    20
    #> 31: 13 2021-08-05    20
    #>     id       Date price
    

    Created on 2021-12-14 by the reprex package (v2.0.1)