I have a data.table test_dt
, in which some of the values are missing for each group
. Can someone suggest the fastest way to interpolate and fill the missing value?
In the graph form x-axis
is column a
and y-axis
is column b
, hench the interval distance of column a
should be considered while predicting b
.
test_dt = structure(list(group = c("B1", "B1", "B1", "B1", "B1", "B1", "B1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C1"), a = c(165, 170, 185, 195, 200, 210, 220, 240, 1, 1.5, 2, 4.5, 5, 5.5, 7, 10), b = c(1.925, 0.575, 0.3, NA, NA, 2.825, 9.05, 27.9, 3.775, 3.225, 2.75, 0.255, 0.04, NA, NA, NA)), row.names = c(NA, -16L), class = c("data.table", "data.frame"), index = integer(0))
> test_dt
group a b
1: B1 165.0 1.925
2: B1 170.0 0.575
3: B1 185.0 0.300
4: B1 195.0 NA
5: B1 200.0 NA
6: B1 210.0 2.825
7: B1 220.0 9.050
8: B1 240.0 27.900
9: C1 1.0 3.775
10: C1 1.5 3.225
11: C1 2.0 2.750
12: C1 4.5 0.255
13: C1 5.0 0.040
14: C1 5.5 NA
15: C1 7.0 NA
16: C1 10.0 NA
Following are some conditions -
I tried to solve it using na.spline
but the result is not correct, especially for group C1
, where value of column a
is 7 and 10
test_dt[, predicted := zoo::na.spline(zoo(.SD), x = a)[, 2], by = c("group")]
> test_dt
group a b predicted
1: B1 165.0 1.925 1.92500000
2: B1 170.0 0.575 0.57500000
3: B1 185.0 0.300 0.30000000
4: B1 195.0 NA 0.18981191
5: B1 200.0 NA 0.40167712
6: B1 210.0 2.825 2.82500000
7: B1 220.0 9.050 9.05000000
8: B1 240.0 27.900 27.90000000
9: C1 1.0 3.775 3.77500000
10: C1 1.5 3.225 3.22500000
11: C1 2.0 2.750 2.75000000
12: C1 4.5 0.255 0.25500000
13: C1 5.0 0.040 0.04000000
14: C1 5.5 NA 0.03038963
15: C1 7.0 NA 1.67389631
16: C1 10.0 NA 16.44642968
Update - 1 Following is the second dataset. Using the solutions below, the predicted values are very high.
temp_dt = structure(list(group = c("K1", "K1", "K1", "K1", "K1", "K1",
"K1", "K1", "K1", "K1", "K1", "K1", "K1", "F1", "F1", "F1", "F1",
"F1", "F1"), a = c(185, 190, 195, 200, 202.5, 205, 210, 212.5,
215, 217.5, 220, 222.5, 225, 22, 23, 24, 25, 26, 35), b = c(NA,
NA, 0.45, 0.6, 1.05, 1.45, 2.7, 3.125, 4.3, NA, 7.05, 8.95, 10.35,
0.5, 0.7, 0.925, 1.2, 1.75, NA), predicted = c(463741.812236,
14.7721571, 0.45, 0.6, 1.05, 1.45, 2.7, 3.125, 4.3, 5.6346081,
7.05, 8.95, 10.35, 0.5, 0.7, 0.925, 1.2, 1.75, 65226323253857.9
)), row.names = c(NA, -19L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x563cccc504c0>, index = integer(0))
test_dt[, predicted := round(exp(na.spline(log(b), x = a, na.rm = FALSE)), 7), by = group]
group a b predicted
1: K1 185.0 NA 4.637418e+05
2: K1 190.0 NA 1.477216e+01
3: K1 195.0 0.450 4.500000e-01
4: K1 200.0 0.600 6.000000e-01
5: K1 202.5 1.050 1.050000e+00
6: K1 205.0 1.450 1.450000e+00
7: K1 210.0 2.700 2.700000e+00
8: K1 212.5 3.125 3.125000e+00
9: K1 215.0 4.300 4.300000e+00
10: K1 217.5 NA 5.634608e+00
11: K1 220.0 7.050 7.050000e+00
12: K1 222.5 8.950 8.950000e+00
13: K1 225.0 10.350 1.035000e+01
14: F1 22.0 0.500 5.000000e-01
15: F1 23.0 0.700 7.000000e-01
16: F1 24.0 0.925 9.250000e-01
17: F1 25.0 1.200 1.200000e+00
18: F1 26.0 1.750 1.750000e+00
19: F1 35.0 NA 6.522632e+13
Ultimately, you have to decide the interpolation procedure based on scientific background. However, in order to avoid producing negative values, the log-transformation is useful. In the following, I combine that with a spline interpolation.
library(data.table)
test_dt = data.table(group = c("B1", "B1", "B1", "B1", "B1", "B1",
"B1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C1"),
a = c(165, 170, 185, 195, 200, 210, 220, 240, 1, 1.5, 2, 4.5, 5, 5.5, 7, 10),
b = c(1.925, 0.575, 0.3, NA, NA, 2.825, 9.05, 27.9, 3.775, 3.225, 2.75, 0.255,
0.04, NA, NA, NA))
library(zoo)
test_dt[, c := exp(na.spline(log(b), x = a, na.rm = FALSE)), by = group]
library(ggplot2)
ggplot(test_dt, aes(x = a, color = group)) +
geom_line(aes(y = c)) +
geom_point(aes(y = c, color = "interpolated")) +
geom_point(aes(y = b))