I have a dataset with about 50 columns (all indicators I got from World Bank), Country Code and Year. These 50 columns are not all complete, and I would like to fill in the missing values based on an lm fit for the column for that specific country. For example:
Doing this for a single country and a single column is absolutely fine when following these steps here: Filling NA using linear regression in R
However, I have over 180 different countries I want to do this to. And I want this to work for each indicator per country (so 50 columns total) So in a way, each country and each column would have its own linear regression model that fills out the missing values.
Here is how it looked after I did the steps above: This is the expected output for ONE column. I would like to do this for EVERY column by individual country groups.
However, the data looks like this:
There are numerous countries and columns that I want to perform this on just like the post above.
This is for a project I am working on for my data-mining / statistics class. Any help would be appreciated and thanks so much in advance!
EDIT
I tried this:
# Reading CSV File
WorldBank=read.csv(file.choose(),header=T, stringsAsFactors=TRUE)
colnames(WorldBank)[1] <- "Country_Code"
colnames(WorldBank)[2] <- "Year"
Countries <- t.data.frame(distinct((WorldBank)[1])) # to create an
array of distinct countries I can loop over
CountryCount <- ncol(Countries) #this is what I intend to use
CountryCountDemo <- 1 # this is what I used for this example to only look at one country.
# Making new data
Worldbankone <- WorldBank
#CountryVarList <- names(Worldbankone[,3:57]) # Column 3 to 57. Not including year and country code.
df_total = data.frame() # Final data that will hold each iteration
for (i in 1:CountryCountDemo) {
for (j in names(Worldbankone[,3:57])){
country_temp <- Countries[i]
Worldbanktemp <- Worldbankone %>% filter(Country_Code ==
country_temp)
# If entire column has a null, make it all 0 (this needs to happen to perform linear regression on each column with all NAs)
if (all(is.na(c(Worldbanktemp[[j]]))) == TRUE) {
Worldbanktemp[[j]] <- 0
} else {
# Staging data to keep all values that do not have an NA value for that column for linear regression model
Worldbanktemp2 <- Worldbanktemp %>% filter(!is.na(.[[j]]))
# Creating fit. The only way to incorporate a variable like 'j' is by pasting it in.
fit <- lm(formula = paste0("`",j,"` ~ Year"), data = Worldbanktemp2)
# making a list that holds all the predicted values
pred = predict(fit,Worldbanktemp)
# Here is where things go wrong...
# I cannot used mutate here. It has no impact on the Worldbanktemp3 when I mutate Worldbanktemp. I tried both if_else and ifelse. I also tried as.name(j) and unquote(j) and !!as.name(j).
Worldbanktemp3 <- Worldbanktemp %>%
# Replace NA with pred in var1
mutate(!!j := if_else(is.na(!!as.name(j)), pred, !!as.name(j)))}
#df_total = rbind(df_total,Worldbanktemp3)}
#Fitting code before this
#Worldbankresult <-rbind(Worldbankresult,Worldbanktemp2)
}}
EDIT 2
# Reproducible example
sampledata2 = structure(list(Country_Code = structure(c(3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L), .Label = c("", "ABW", "AFG", "AGO", "ALB", "AND",
"ARE", "ARG", "ARM", "ASM", "ATG", "AUS", "AUT", "AZE", "BDI",
"BEL", "BEN", "BFA", "BGD", "BGR", "BHR", "BHS", "BIH", "BLR",
"BLZ", "BMU", "BOL", "BRA", "BRB", "BRN", "BTN", "BWA", "CAF",
"CAN", "CHE", "CHI", "CHL", "CHN", "CIV", "CMR", "COD", "COG",
"COL", "COM", "CPV", "CRI", "CUB", "CUW", "CYM", "CYP", "CZE",
"DEU", "DJI", "DMA", "DNK", "DOM", "DZA", "ECU", "EGY", "ERI",
"ESP", "EST", "ETH", "FIN", "FJI", "FRA", "FRO", "FSM", "GAB",
"GBR", "GEO", "GHA", "GIB", "GIN", "GMB", "GNB", "GNQ", "GRC",
"GRD", "GRL", "GTM", "GUM", "GUY", "HKG", "HND", "HRV", "HTI",
"HUN", "IDN", "IMN", "IND", "IRL", "IRN", "IRQ", "ISL", "ISR",
"ITA", "JAM", "JOR", "JPN", "KAZ", "KEN", "KGZ", "KHM", "KIR",
"KNA", "KOR", "KWT", "LAO", "LBN", "LBR", "LBY", "LCA", "LIE",
"LKA", "LSO", "LTU", "LUX", "LVA", "MAC", "MAF", "MAR", "MCO",
"MDA", "MDG", "MDV", "MEX", "MHL", "MKD", "MLI", "MLT", "MMR",
"MNE", "MNG", "MNP", "MOZ", "MRT", "MUS", "MWI", "MYS", "NAM",
"NCL", "NER", "NGA", "NIC", "NLD", "NOR", "NPL", "NRU", "NZL",
"OMN", "PAK", "PAN", "PER", "PHL", "PLW", "PNG", "POL", "PRI",
"PRK", "PRT", "PRY", "PSE", "PYF", "QAT", "ROU", "RUS", "RWA",
"SAU", "SDN", "SEN", "SGP", "SLB", "SLE", "SLV", "SMR", "SOM",
"SRB", "SSD", "STP", "SUR", "SVK", "SVN", "SWE", "SWZ", "SXM",
"SYC", "SYR", "TCA", "TCD", "TGO", "THA", "TJK", "TKM", "TLS",
"TON", "TTO", "TUN", "TUR", "TUV", "TZA", "UGA", "UKR", "URY",
"USA", "UZB", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WSM",
"XKX", "YEM", "ZAF", "ZMB", "ZWE"), class = "factor"), Year = c(2000L,
2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L,
2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 2018L,
2019L, 2020L, 2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L,
2007L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L,
2016L, 2017L, 2018L, 2019L), EG.ELC.ACCS.ZS = c(NA, NA, NA, NA,
NA, 22.29526901, 28.09996223, 33.9016037, 42.4, 45.52068329,
42.7, 43.22201891, 69.1, 68.98294067, 89.5, 71.5, 97.7, 97.7,
98.71562195, 97.7, NA, 100, 100, 100, 100, 100, 100, 100, 100,
100, 100, 100, 100, 99.9, 100, 99.95, 99.98, 99.89, 99.89, 100,
100), NY.GDP.COAL.RT.ZS = c(NA, NA, 0.004340991, 0.007421791,
0.016454885, 0.010903631, 0.011152411, 0.076415204, 0.224465173,
0.126422321, 0.212188085, 0.495356516, 0.270285101, 0.237255122,
0.226599986, 0.162117663, 0.24730234, 0.397626964, 0.535542773,
0.373839671, NA, 0.000620564, 0.001277136, 0.000274595, 0.000890331,
0.006822021, 0.003077833, 0.002931413, 0.004030065, 0.013985094,
0.000729611, 0.001394767, 0.000801741, 0.000360916, 0.000108033,
0, 0.005924115, 0.00038768, 0.015576529, 0.033583408, 0.024377128
), NY.GDP.FRST.RT.ZS = c(NA, NA, 0.958003595, 0.664331186, 0.387787089,
0.332204753, 0.454077242, 0.342748815, 0.353698168, 0.274778262,
0.358436235, 0.309317612, 0.263852446, 0.259193525, 0.267631558,
0.290360037, 0.397862173, 0.26727029, 0.335310319, 0.367809087,
NA, 0.126636828, 0.062644576, 0.066551134, 0.062998763, 0.053996825,
0.04788751, 0.054154854, 0.074771754, 0.069714105, 0.074808569,
0.065976686, 0.210223146, 0.182521796, 0.165152821, 0.169255817,
0.191365261, 0.15742073, 0.176432723, 0.169991392, 0.16648548
)), row.names = c(NA, 41L), class = "data.frame")
Since you already know how to do this for one dataframe with a single country, you are very close to your solution. But to make this easy on yourself, you need to do a few things.
Create a reproducible example using dput. The janitor
library has the clean_names() function to fix columns names.
Write your own interpolation function that takes a dataframe with one country as the input, and returns an interpolated dataframe for one country.
Pivot_longer to get all the data columns into a one parameterized column.
Use the dplyr
function group_split to take your large multicountry dataframe, and break it into a list of dataframes, one for each country and parameter.
Use the purrr
function map to map each of the dataframes in the list to a new list of interpolate dataframes.
Use dplyr's bind_rows to convert the list interpolated dataframes back into one dataframe, and pivot_wider to get your original data shape back.
library(tidyverse)
library(purrr)
library(janitor)
my_country_interpolater<-function(single_country_df){
data_to_build_model<-single_country_df %>%
filter(!is.na(value)) %>%
select(year,value)
years_to_interpolate<-single_country_df %>%
filter(is.na(value)) %>%
select(year)
fit<-lm(value ~ year, data = data_to_build_model)
value = predict(fit,years_to_interpolate)
interpolated_data<-tibble(years_to_interpolate, value)
single_country_interpolated_df<-bind_rows(data_to_build_model,interpolated_data) %>%
mutate(country_code=single_country_df$country_code[1]) %>%
mutate(parameter=single_country_df$parameter[1]) %>% # added this for the additional parameters
select(country_code, year, parameter, value) %>%
arrange(year)
return (single_country_interpolated_df)
}
interpolated_df <-sampledata2 %>%
clean_names() %>%
pivot_longer(cols=c(3:5),names_to = "parameter", values_to="value") %>%
group_by(country_code,parameter) %>%
group_split() %>%
# map(preprocess_data) %>% if you need a preprocessing step
map(my_country_interpolater) %>%
bind_rows() %>%
pivot_wider(names_from = parameter, values_from=value, names_glue = "{parameter}_interp")