Search code examples
rdateinterpolationimputation

Data imputation by linear interpolation according to date in R


I have a large dataset of mineral nitrogen values from different plots which includes some missing data were on some dates we could not take samples. it is known that mineral N values in soil change linearly between samplings.

for the sake of simplification I have created a data frame that has 10 plots with 4 dates (with different distances between them) with missing data in one of the dates:

df <- data.frame(plot= c(1,2,3,4,5,6,7,8,9,10),
                  date = c("2020-10-01", "2020-10-01","2020-10-01","2020-10-01","2020-10-01","2020-10-01","2020-10-01","2020-10-01","2020-10-01","2020-10-01",
                           "2020-10-08", "2020-10-08","2020-10-08","2020-10-08","2020-10-08","2020-10-08","2020-10-08","2020-10-08","2020-10-08","2020-10-08",
                           "2020-10-29","2020-10-29","2020-10-29","2020-10-29","2020-10-29","2020-10-29","2020-10-29","2020-10-29","2020-10-29","2020-10-29",
                           "2020-11-05","2020-11-05","2020-11-05","2020-11-05","2020-11-05","2020-11-05","2020-11-05","2020-11-05","2020-11-05","2020-11-05"),
                  Nmin = c(100, 120,  50,  60,  70,  80, 100,  70,  30,  50,  90, 130,  60,  60,  60,  90, 105,  60,  25,  40,  NA,  NA,  NA,  NA,  NA,  NA,  
                           NA,  NA,  NA,  NA, 50, 170, 100, 60,  20, 130, 125,  20,   5,   0))
df$date <- as.Date(df$date, format="%d.%m.%Y")
df$Nmin <- as.numeric(df$Nmin)

is there a function that can calculate the missing values of Nmin plot-wise and takes in concideration the time between samplings (date)?


Solution

  • Using approx.

    df <- transform(df, flag=ifelse(is.na(Nmin), 1, 0))  ## set flag for sake of identification
    
    res <- by(df, df$plot, transform, Nmin=approx(date, Nmin, date)$y) |> unsplit(df$plot)
    res
    #    plot       date Nmin flag
    # 1     1 2020-10-01  100    0
    # 2     2 2020-10-01  120    0
    # 3     3 2020-10-01   50    0
    # 4     4 2020-10-01   60    0
    # 5     5 2020-10-01   70    0
    # 6     6 2020-10-01   80    0
    # 7     7 2020-10-01  100    0
    # 8     8 2020-10-01   70    0
    # 9     9 2020-10-01   30    0
    # 10   10 2020-10-01   50    0
    # 11    1 2020-10-08   90    0
    # 12    2 2020-10-08  130    0
    # 13    3 2020-10-08   60    0
    # 14    4 2020-10-08   60    0
    # 15    5 2020-10-08   60    0
    # 16    6 2020-10-08   90    0
    # 17    7 2020-10-08  105    0
    # 18    8 2020-10-08   60    0
    # 19    9 2020-10-08   25    0
    # 20   10 2020-10-08   40    0
    # 21    1 2020-10-29   60    1
    # 22    2 2020-10-29  160    1
    # 23    3 2020-10-29   90    1
    # 24    4 2020-10-29   60    1
    # 25    5 2020-10-29   30    1
    # 26    6 2020-10-29  120    1
    # 27    7 2020-10-29  120    1
    # 28    8 2020-10-29   30    1
    # 29    9 2020-10-29   10    1
    # 30   10 2020-10-29   10    1
    # 31    1 2020-11-05   50    0
    # 32    2 2020-11-05  170    0
    # 33    3 2020-11-05  100    0
    # 34    4 2020-11-05   60    0
    # 35    5 2020-11-05   20    0
    # 36    6 2020-11-05  130    0
    # 37    7 2020-11-05  125    0
    # 38    8 2020-11-05   20    0
    # 39    9 2020-11-05    5    0
    # 40   10 2020-11-05    0    0
    

    Let's take a look at the plot.

    clr <- rainbow(10)
    with(res, plot(Nmin ~ date, type='n'))
    by(res, res$plot, with, points(jitter(Nmin) ~ date, type='b', pch=ifelse(flag == 1, 21, 16), col=clr[plot], bg='white'))
    legend('topleft', legend=paste('plot', 1:10), lty=1, col=clr, ncol=4, bty='n', cex=.7)
    

    enter image description here

    Note: For non-linear inter/extrapolation, see this answer.


    Data:

    df <- structure(list(plot = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 
    3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 
    3, 4, 5, 6, 7, 8, 9, 10), date = structure(c(18536, 18536, 18536, 
    18536, 18536, 18536, 18536, 18536, 18536, 18536, 18543, 18543, 
    18543, 18543, 18543, 18543, 18543, 18543, 18543, 18543, 18564, 
    18564, 18564, 18564, 18564, 18564, 18564, 18564, 18564, 18564, 
    18571, 18571, 18571, 18571, 18571, 18571, 18571, 18571, 18571, 
    18571), class = "Date"), Nmin = c(100, 120, 50, 60, 70, 80, 100, 
    70, 30, 50, 90, 130, 60, 60, 60, 90, 105, 60, 25, 40, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, 50, 170, 100, 60, 20, 130, 125, 
    20, 5, 0), flag = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA, -40L
    ))