I'm trying to interpolate annual data into monthly data. I've seen several examples online, but I can't get them to work Here's one, here's another, and I get errors of all kinds.
My data looks like this
structure(list(countryname = c("Aruba", "Aruba", "Aruba", "Aruba",
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba",
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba",
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba",
"Aruba", "Aruba", "Aruba", "Aruba", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Angola",
"Angola", "Angola", "Angola", "Angola", "Angola", "Angola", "Angola",
"Angola", "Angola", "Angola", "Angola", "Angola", "Angola", "Angola",
"Angola", "Angola", "Angola", "Angola", "Angola", "Anguilla",
"Anguilla", "Anguilla", "Anguilla", "Anguilla", "Anguilla", "Anguilla",
"Anguilla", "Anguilla", "Anguilla"), year = c(1986, 1987, 1988,
1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
2011, 2012, 2013, 2014, 1958, 1959, 1960, 1961, 1962, 1963, 1964,
1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
1976, 1977, 1978, 1979, 1980, 1981, 1982, 1984, 1985, 1986, 1987,
1988, 1989, 1990, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
2014, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 1990,
1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999), total = c(286.511,
344.328, 467.524, 539.787, 609.052, 690.367, 742.735, 793.093,
946.521, 1017.982, 1121.778, 1185.47, 1248.148, 1376.097, 1502.349,
1581.668, 1783.635, 2012.886, 2090.128, 2283.757, 2382.169, 2479.276,
2578.016, 2555.033, 2582.489, 2656.015, 2747.218, 2888.874, 3009.389,
1527, 2037, 2244, 2162, 1760, 1713, 1867, 2252, 2395, 2449, 2488.4,
2872.3, 2853.1, 2928, 3660.3, 4280.4, 5214.8, 5016.4, 6243.1,
5493.6, 6770.4, 9789.5, 9603.1, 10053, 4185.38, 4787.87, 6728.06,
8773.05, 12698.5, 17579.82, 19972.779, 26547.45, 16855.486058,
33202.5772169922, 48354.2870012583, 64742.29199, 84061.57373,
42170.25521, 46008.7436596187, 49215.7766771853, 46365.3341990244,
0.718514, 26.547404, 90.561909, 90.0346783, 441.87615963, 1845.82841034,
7453.3254978, 23556.243282338, 54332.057918413, 89699.02591742,
138396.82327341, 273708.10979785, 492519.2833002, 801048.13352684,
1285702.23670599, 1532272.06111931, 1973567.61658344, 2451384.32434691,
2819717.17874511, 2852043.16889253, 135.972994003056, 165.698,
190, 200.98, 199.009, 212.701, 240.183, 282.136, 308.716, 358.751
), hh = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
12409.359888, 2669.05197895018, 34449.662, 3595.39872, 10187.97902,
10224.02617, 11409.47128, 9743.61473, 8246.96039, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 50.331, 55.832, 79.786, 85.04, 93.316, 104.006, 126.771,
147.306, 171.725), corp = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 4446.12617, 30533.525238042, 13904.6250012583,
61146.89327, 73873.59471, 31946.22904, 34599.2723796187, 39472.1619471853,
38118.3738090244, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 115.367, 134.168, 121.194,
113.969, 119.385, 136.177, 155.365, 161.41, 187.026), agr = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1.08, 2.09, 1.415, 1.276,
1.703, 1.85, 2.434, 2.385, 2.161), manufac = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 0.103556200987511, 0.241631135637525,
0.691812953819343, 1.02980888759802, 4.50900958466454, 5.02391402846355,
4.91891954690677, 4.36805808887598, 9.904), real_estate = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 13.327, 13.813, 6.75,
13.223, 11.223, 13.284, 13.509, 11.515, 11.234), whole_sale = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 77.119, 92.189, 91.358,
79.837, 76.949, 79.75, 92.241, 95.118, 109.549), transport_comm = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 14.572, 13.047, 12.774,
11.398, 8.998, 10.174, 10.226, 10.132, 8.803), others = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 9.16544379901249, 12.7873688643625,
8.20518704618064, 7.20519111240198, 16.0029904153354, 26.0950859715364,
32.0360804530932, 37.891941911124, 45.375), error = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
Note that this is a panel of countries, so the interpolation should be done separately for each country.
My best attempt so far is adapting the example in the second link
expand_data <- function(x) {
years <- min(x$year):max(x$year)
months_data <- 1:12
grid <- expand.grid(month=months_data, year=years)
x$quarter <- 1
merged <- merge(grid, x, by=c('year', 'month'), all.x=TRUE) # Use merge instead of left_join
merged$country <- x$country[1]
return(merged)
}
interpolate_data <- function(data) {
xout <- 1:nrow(data)
y <- data$value
interpolation <- approx(x=xout[!is.na(y)], y=y[!is.na(y)], xout=xout)
data$value_interpolated <- interpolation$y
return(data)
}
expand_and_interpolate <- function(x) interpolate_data(expand_data(x))
monthly_data <- credit_data %>%
group_by(countryname) %>%
do(expand_and_interpolate(.))
But I get this error
Error in fix.by(by.y, y) : 'by' must specify a uniquely valid column
This process converts year
to a Date
-class column, then expands it to be every month between the first and last observed dates. From there, it uses approx
across all other fields.
library(dplyr)
out <- credit_data |>
mutate(date = as.Date(paste0(year, "-01-01"))) |>
reframe(
.by = countryname,
month = seq(min(date), max(date), by = "month"),
year = as.integer(format(month, format = "%Y")),
across(-c(year, date, month),
~ if (any(!is.na(.x))) approx(date, .x, xout = month, na.rm = TRUE)$y else rep(.x[1], length(month)))
)
filter(out, countryname == "Anguilla")[20:30,]
# # A tibble: 11 × 13
# countryname month year total hh corp agr manufac real_estate whole_sale transport_comm others error
# <chr> <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 Anguilla 1991-08-01 1991 180. 53.5 126. 1.67 0.184 13.6 85.9 13.7 11.3 NA
# 2 Anguilla 1991-09-01 1991 182. 54.0 128. 1.75 0.195 13.7 87.2 13.6 11.6 NA
# 3 Anguilla 1991-10-01 1991 184. 54.4 129. 1.84 0.207 13.7 88.4 13.4 11.9 NA
# 4 Anguilla 1991-11-01 1991 186. 54.9 131. 1.92 0.219 13.7 89.7 13.3 12.2 NA
# 5 Anguilla 1991-12-01 1991 188. 55.4 133. 2.00 0.230 13.8 90.9 13.2 12.5 NA
# 6 Anguilla 1992-01-01 1992 190 55.8 134. 2.09 0.242 13.8 92.2 13.0 12.8 NA
# 7 Anguilla 1992-02-01 1992 191. 57.9 133. 2.03 0.280 13.2 92.1 13.0 12.4 NA
# 8 Anguilla 1992-03-01 1992 192. 59.8 132. 1.98 0.315 12.7 92.1 13.0 12.0 NA
# 9 Anguilla 1992-04-01 1992 193. 61.8 131. 1.92 0.354 12.1 92.0 13.0 11.6 NA
# 10 Anguilla 1992-05-01 1992 194. 63.8 130. 1.87 0.390 11.5 91.9 13.0 11.3 NA
# 11 Anguilla 1992-06-01 1992 195. 65.8 129. 1.81 0.429 10.9 91.8 12.9 10.9 NA
The use of .by=
requires dplyr_1.1.0
or newer; if you have an older version, change from mutate(.by=c(..), stuff)
to group_by(..) |> mutate(stuff) |> ungroup()
.