Search code examples
r, linear-regression, predict

How to run linear models, and use predict function more efficiently?


I have a dataframe that looks like so:

# Sample data: one POSIXct timestamp column (six readings) followed by
# thirteen numeric logger columns, most of which contain NA gaps.
df <- structure(
  list(
    Date_Time_GMT_3 = structure(
      c(
        1622552400, 1622553300, 1622554200,
        1622555100, 1622556000, 1622556900
      ),
      class = c("POSIXct", "POSIXt"),
      tzone = "EST"
    ),
    X20819830_R1AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 16.808, 16.713, 17.753),
    X20819742_R1AR_S_Stationary = c(16.903, 16.828, 16.808, NA_real_, NA_real_, NA_real_),
    X20822215_R3AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 13.942, 13.942, 13.846),
    X20822215_R3AR_S_Stationary = c(13.942, 13.972, 13.842, NA_real_, NA_real_, NA_real_),
    X20874235_R4AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 14.134, 14.534, 14.404),
    X20874235_R4AR_S_Stationary = c(14.23, 14.23, 14.134, NA_real_, NA_real_, NA_real_),
    X20874311_F1AR_U_Stationary = c(NA_real_, NA_real_, NA_real_, 15.187, 15.327, 15.567),
    X20874311_F1AR_S_Stationary = c(15.282, 15.387, 15.587, NA_real_, NA_real_, NA_real_),
    X20817727_F8AR_U = c(15.421, 14.441, 14.631, 14.781, 15.521, 15.821),
    X20819742_X1AR_U = c(14.996, 15.996, 14.776, 14.920, 14.870, 14.235),
    X20819742_R2AR_U = c(14.781, 15.521, 15.821, NA_real_, NA_real_, NA_real_),
    X20817727_R5AR_U = c(NA_real_, NA_real_, NA_real_, 13.942, 13.942, 13.846),
    X20817727_R7AR = c(14.23, 14.23, 14.134, NA_real_, NA_real_, NA_real_)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)

Based on the results of linear models I have already fitted, I want to predict the missing values in this dataframe. Here is an example of the results I have for the linear models:

# Lookup table of the best-fitting pairs: one row per response column, the
# predictor column it is paired with, and the R-squared of that fit.
# The object is a grouped tibble (grouped by `response`), reproduced verbatim
# from its dput() form, including the grouping metadata.
df_HighR <- structure(
  list(
    response = c(
      "X20817727_F8AR_U", "X20817727_R5AR_U", "X20817727_R7AR",
      "X20819742_R2AR_U", "X20819742_X1AR_U"
    ),
    predictor = c(
      "X20819742_R1AR_S_Stationary", "X20822215_R3AR_U_Stationary",
      "X20874235_R4AR_S_Stationary", "X20819742_R1AR_S_Stationary",
      "X20822215_R3AR_U_Stationary"
    ),
    r.squared = c(
      0.859062596478993, 1, 1, 0.993125520793874, 0.995714040802335
    )
  ),
  class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -5L),
  groups = structure(
    list(
      response = c(
        "X20817727_F8AR_U", "X20817727_R5AR_U", "X20817727_R7AR",
        "X20819742_R2AR_U", "X20819742_X1AR_U"
      ),
      .rows = structure(
        list(1L, 2L, 3L, 4L, 5L),
        ptype = integer(0),
        class = c("vctrs_list_of", "vctrs_vctr", "list")
      )
    ),
    row.names = c(NA, -5L),
    class = c("tbl_df", "tbl", "data.frame"),
    .drop = TRUE
  )
)

Essentially, every column that has NA values needs to be run through the predict.lm() function against the column it is paired with in the df_HighR dataframe (e.g., column X20817727_F8AR_U has NA values that will be predicted from X20819742_R1AR_S_Stationary).

I have code that works but I'm wondering if there is a way to simplify it. Code below:


# Fit one simple linear regression per mobile logger (the predictor with the
# best R squared for that logger), then predict each logger's values with
# confidence bounds. Every model/prediction pair below repeats the same steps.
#
# NOTE(review): the column names used here (e.g. `20817727_F8AR_U`,
# `20822215_R3AR_Stationary`) do not appear in the sample `df`, whose columns
# carry an `X` prefix and a `_U_`/`_S_` infix, and the response/predictor
# pairings differ from those listed in `df_HighR` -- presumably this script
# targets the original, unsimplified data; confirm before running it.
model_F8AR = lm(df$`20817727_F8AR_U` ~ df$`20822215_R3AR_Stationary`)
summary(model_F8AR)


model_R2AR = lm(df$`20819742_R2AR_U` ~ df$`20822215_R3AR_Stationary`)
summary(model_R2AR)

model_R5AR = lm(df$`20817727_R5AR_U` ~ df$`20874311_F1AR_Stationary`)
summary(model_R5AR)

model_X1AR = lm(df$`20819742_X1AR_U` ~ df$`20874311_F1AR_Stationary`)
summary(model_X1AR)


######## Predict the values for mobile loggers
# For each model: predict with a confidence interval, convert the result to a
# data frame, and rename its three columns to <logger>_Predicted / _lwr / _upr.
# `new` partially matches predict.lm()'s `newdata` argument.
# NOTE(review): because the formulas use `df$...` terms, the new data only
# lines up with the model if as.data.frame() names its single column after the
# `df$...` expression -- presumably it does; verify.
#F8AR
Predicted_F8AR = predict.lm(model_F8AR,new=as.data.frame(df$`20822215_R3AR_Stationary`), interval="confidence")
Predicted_F8AR = as.data.frame(Predicted_F8AR)
names(Predicted_F8AR)[1] = "F8AR_Predicted"
names(Predicted_F8AR)[2] = "F8AR_lwr"
names(Predicted_F8AR)[3] = "F8AR_upr"

#R2AR
Predicted_R2AR = predict.lm(model_R2AR,new=as.data.frame(df$`20822215_R3AR_Stationary`), interval="confidence")
Predicted_R2AR = as.data.frame(Predicted_R2AR)
names(Predicted_R2AR)[1] = "R2AR_Predicted"
names(Predicted_R2AR)[2] = "R2AR_lwr"
names(Predicted_R2AR)[3] = "R2AR_upr"

#R5AR
Predicted_R5AR = predict.lm(model_R5AR,new=as.data.frame(df$`20874311_F1AR_Stationary`), interval="confidence")
Predicted_R5AR = as.data.frame(Predicted_R5AR)
names(Predicted_R5AR)[1] = "R5AR_Predicted"
names(Predicted_R5AR)[2] = "R5AR_lwr"
names(Predicted_R5AR)[3] = "R5AR_upr"


#X1AR
Predicted_X1AR = predict.lm(model_X1AR,new=as.data.frame(df$`20874311_F1AR_Stationary`), interval="confidence")
Predicted_X1AR = as.data.frame(Predicted_X1AR)
names(Predicted_X1AR)[1] = "X1AR_Predicted"
names(Predicted_X1AR)[2] = "X1AR_lwr"
names(Predicted_X1AR)[3] = "X1AR_upr"

Any ideas of how to clean this up?


Solution

  • We may use map2 to loop over the 'response' and 'predictor' columns of the 'df_HighR' dataset, build the lm for each pair, and store the fitted models and their predictions as list columns

    library(purrr)
    library(dplyr)
    # For every (response, predictor) pair listed in df_HighR, fit
    # `response ~ predictor` on df and keep both the fitted model and its
    # predictions as list columns (one model and one data frame per row).
    out <- df_HighR %>%
      # df_HighR is a grouped_df; drop the grouping before adding the columns.
      ungroup() %>%
      mutate(
        # reformulate() builds the formula `response ~ predictor` from the two
        # column names, so no model has to be written out by hand.
        Model = map2(
          response, predictor,
          function(resp, pred) lm(reformulate(pred, response = resp), data = df)
        ),
        # Predict for every row of df from the matching predictor column
        # (dispatches to predict.lm). `newdata` is spelled out in full rather
        # than relying on partial matching of `new`. The result has columns
        # fit, lwr and upr; rows whose predictor is NA come back as NA.
        predicted = map2(
          Model, predictor,
          function(fit, pred) {
            as.data.frame(
              predict(fit, newdata = df[pred], interval = "confidence")
            )
          }
        )
      )
    

    -output

    > out
    # A tibble: 5 × 5
      response         predictor                   r.squared Model  predicted   
      <chr>            <chr>                           <dbl> <list> <list>      
    1 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>   <df [6 × 3]>
    2 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>   <df [6 × 3]>
    3 X20817727_R7AR   X20874235_R4AR_S_Stationary     1     <lm>   <df [6 × 3]>
    4 X20819742_R2AR_U X20819742_R1AR_S_Stationary     0.993 <lm>   <df [6 × 3]>
    5 X20819742_X1AR_U X20822215_R3AR_U_Stationary     0.996 <lm>   <df [6 × 3]>
    

    The output could be unnested

    library(tidyr)
    # Expand the `predicted` list column so that every prediction row
    # (fit, lwr, upr) sits next to the response/predictor pair it belongs to.
    unnest(out, cols = predicted)
    # A tibble: 30 × 7
       response         predictor                   r.squared Model    fit   lwr   upr
       <chr>            <chr>                           <dbl> <list> <dbl> <dbl> <dbl>
     1 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    15.4  11.9  18.8
     2 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    14.7  12.4  16.9
     3 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    14.5  11.7  17.2
     4 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    NA    NA    NA  
     5 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    NA    NA    NA  
     6 X20817727_F8AR_U X20819742_R1AR_S_Stationary     0.859 <lm>    NA    NA    NA  
     7 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>    NA    NA    NA  
     8 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>    NA    NA    NA  
     9 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>    NA    NA    NA  
    10 X20817727_R5AR_U X20822215_R3AR_U_Stationary     1     <lm>    13.9  13.9  13.9
    # … with 20 more rows