I have a dataframe that looks like so:
# Example input data: six timestamps 15 minutes apart (POSIXct) plus 13
# numeric logger columns. Most logger columns are NA for either the first
# three or the last three rows; X20817727_F8AR_U and X20819742_X1AR_U are
# complete.
# NOTE(review): the time column is named "GMT_3" but its tzone attribute is
# "EST" - confirm which zone is intended.
df = structure(list(Date_Time_GMT_3 =
structure(c(1622552400, 1622553300,1622554200, 1622555100, 1622556000, 1622556900),
class = c("POSIXct","POSIXt"),
tzone = "EST"),
X20819830_R1AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 16.808, 16.713, 17.753),
X20819742_R1AR_S_Stationary = c(16.903, 16.828, 16.808, NA_real_, NA_real_, NA_real_),
X20822215_R3AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 13.942, 13.942, 13.846),
X20822215_R3AR_S_Stationary = c(13.942, 13.972, 13.842, NA_real_, NA_real_, NA_real_),
X20874235_R4AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 14.134, 14.534, 14.404),
X20874235_R4AR_S_Stationary = c(14.23, 14.23, 14.134, NA_real_, NA_real_, NA_real_),
X20874311_F1AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 15.187, 15.327, 15.567),
X20874311_F1AR_S_Stationary = c(15.282, 15.387, 15.587, NA_real_, NA_real_, NA_real_),
X20817727_F8AR_U = c(15.421, 14.441, 14.631, 14.781, 15.521, 15.821),
X20819742_X1AR_U = c(14.996, 15.996, 14.776, 14.920, 14.870, 14.235),
X20819742_R2AR_U = c(14.781, 15.521, 15.821, NA_real_, NA_real_, NA_real_),
X20817727_R5AR_U = c(NA_real_, NA_real_, NA_real_, 13.942, 13.942, 13.846),
X20817727_R7AR = c(14.23, 14.23, 14.134, NA_real_, NA_real_, NA_real_)),
row.names = c(NA, 6L), class = "data.frame")
Based on results I calculated with linear models, I want to predict the missing values in this dataframe. Here is an example of the results I have for the linear models:
# Lookup table pairing each `response` column with the `predictor` column it
# is modelled on, plus an `r.squared` value for that pair (five rows).
# Stored as a dplyr grouped tibble (class "grouped_df", grouped by `response`
# with one row per group), so ungroup() it before operations that should span
# all rows at once.
df_HighR = structure(list(response = c("X20817727_F8AR_U", "X20817727_R5AR_U",
"X20817727_R7AR", "X20819742_R2AR_U", "X20819742_X1AR_U"), predictor = c("X20819742_R1AR_S_Stationary",
"X20822215_R3AR_U_Stationary", "X20874235_R4AR_S_Stationary",
"X20819742_R1AR_S_Stationary", "X20822215_R3AR_U_Stationary"),
r.squared = c(0.859062596478993, 1, 1, 0.993125520793874,
0.995714040802335)), class = c("grouped_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L), groups = structure(list(
response = c("X20817727_F8AR_U", "X20817727_R5AR_U", "X20817727_R7AR",
"X20819742_R2AR_U", "X20819742_X1AR_U"), .rows = structure(list(
1L, 2L, 3L, 4L, 5L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
Essentially, every column that has `NA` needs to be run through the `predict.lm()` function, against the column that it matches in the `df_HighR` dataframe (e.g., column `X20817727_F8AR_U` has `NA` values that will be predicted by `X20819742_R1AR_S_Stationary`).
I have code that works but I'm wondering if there is a way to simplify it. Code below:
# Make the linear model with the best R squared for each mobile logger.
#
# The formulas use bare column names together with `data = df` instead of
# `df$col` terms. A model fitted on `df$col` terms ignores `newdata` in
# predict(): the terms are re-evaluated against the global `df`, so whatever
# is passed as `newdata` has no effect on the predictions.
#
# NOTE(review): these column names (no "X" prefix, "_Stationary" without a
# "_U"/"_S" part) do not appear in the example `df` defined in this file -
# presumably they come from the full dataset; confirm before running.
model_F8AR <- lm(`20817727_F8AR_U` ~ `20822215_R3AR_Stationary`, data = df)
summary(model_F8AR)
model_R2AR <- lm(`20819742_R2AR_U` ~ `20822215_R3AR_Stationary`, data = df)
summary(model_R2AR)
model_R5AR <- lm(`20817727_R5AR_U` ~ `20874311_F1AR_Stationary`, data = df)
summary(model_R5AR)
model_X1AR <- lm(`20819742_X1AR_U` ~ `20874311_F1AR_Stationary`, data = df)
summary(model_X1AR)

# Predict every row of `data` from `model` and return a data frame with the
# fitted value and its confidence bounds.
#
# model:  a fitted `lm` object.
# data:   data frame holding the model's predictor column; rows where the
#         predictor is NA yield NA predictions (predict.lm uses na.pass).
# prefix: label used to build the three output column names,
#         `<prefix>_Predicted`, `<prefix>_lwr` and `<prefix>_upr`.
predict_with_ci <- function(model, data, prefix) {
  fit <- predict(model, newdata = data, interval = "confidence")
  out <- as.data.frame(fit)
  names(out) <- paste0(prefix, c("_Predicted", "_lwr", "_upr"))
  out
}

######## Predict the values for mobile loggers
Predicted_F8AR <- predict_with_ci(model_F8AR, df, "F8AR")
Predicted_R2AR <- predict_with_ci(model_R2AR, df, "R2AR")
Predicted_R5AR <- predict_with_ci(model_R5AR, df, "R5AR")
Predicted_X1AR <- predict_with_ci(model_X1AR, df, "X1AR")
Any ideas of how to clean this up?
We may use `map2` to loop over the 'response' and 'predictor' columns of the 'df_HighR' dataset, build the `lm` for each pair, and store the predictions as `list` columns:
library(purrr)
library(dplyr)

# For each (response, predictor) pair in `df_HighR`, fit
# lm(response ~ predictor) on `df` and keep both the fitted model and its
# row-wise predictions (fit, lwr, upr) as list columns.
# `df_HighR` is a grouped tibble, so it is ungrouped first to let mutate()
# see the whole `response`/`predictor` columns at once.
out <- df_HighR %>%
  ungroup() %>%
  mutate(
    Model = map2(
      response, predictor,
      function(resp, pred) lm(reformulate(pred, response = resp), data = df)
    ),
    predicted = map2(
      Model, predictor,
      function(model, pred) {
        # `newdata` is spelled out in full (the original relied on partial
        # matching of `new`); df[pred] keeps a one-column data frame whose
        # name matches the model term.
        as.data.frame(
          predict(model, newdata = df[pred], interval = "confidence")
        )
      }
    )
  )
Output:
> out
# A tibble: 5 × 5
response predictor r.squared Model predicted
<chr> <chr> <dbl> <list> <list>
1 X20817727_F8AR_U X20819742_R1AR_S_Stationary 0.859 <lm> <df [6 × 3]>
2 X20817727_R5AR_U X20822215_R3AR_U_Stationary 1 <lm> <df [6 × 3]>
3 X20817727_R7AR X20874235_R4AR_S_Stationary 1 <lm> <df [6 × 3]>
4 X20819742_R2AR_U X20819742_R1AR_S_Stationary 0.993 <lm> <df [6 × 3]>
5 X20819742_X1AR_U X20822215_R3AR_U_Stationary 0.996 <lm> <df [6 × 3]>
The output could be `unnest`ed:
library(tidyr)
# Expand the `predicted` list column: each row of the nested prediction data
# frames becomes its own row, with fit/lwr/upr as ordinary columns.
unnest(out, cols = predicted)
# A tibble: 30 × 7
response predictor r.squared Model fit lwr upr
<chr> <chr> <dbl> <list> <dbl> <dbl> <dbl>
1 X20817727_F8AR_U X20819742_R1AR_S_Stationary 0.859 <lm> 15.4 11.9 18.8
2 X20817727_F8AR_U X20819742_R1AR_S_Stationary 0.859 <lm> 14.7 12.4 16.9
3 X20817727_F8AR_U X20819742_R1AR_S_Stationary 0.859 <lm> 14.5 11.7 17.2
4 X20817727_F8AR_U X20819742_R1AR_S_Stationary 0.859 <lm> NA NA NA
5 X20817727_F8AR_U X20819742_R1AR_S_Stationary 0.859 <lm> NA NA NA
6 X20817727_F8AR_U X20819742_R1AR_S_Stationary 0.859 <lm> NA NA NA
7 X20817727_R5AR_U X20822215_R3AR_U_Stationary 1 <lm> NA NA NA
8 X20817727_R5AR_U X20822215_R3AR_U_Stationary 1 <lm> NA NA NA
9 X20817727_R5AR_U X20822215_R3AR_U_Stationary 1 <lm> NA NA NA
10 X20817727_R5AR_U X20822215_R3AR_U_Stationary 1 <lm> 13.9 13.9 13.9
# … with 20 more rows