Tags: r, prediction, glmnet, marginal-effects

How does predict.cv.glmnet() calculate the link value for binomial models?


I'm thinking this is a coding error on my part, but I can't figure out what I'm doing wrong. I'm trying to create marginal effects plots for x when both x and x^2 are in a binomial glmnet model. When I made predictions on a new dataset using predict.cv.glmnet(), it almost seemed as if the signs of the model coefficients were switched. So I've been comparing predictions from hardcoding the model formula against predictions from the predict function. I get the same predictions on the training data but different predictions on the new data, and I'm not sure why.

#############example############
library(tidyverse)
library(glmnet)
data(BinomialExample)
#head(x)
#head(y)


#training data
squareit<-function(x){x^2}
x%>%
  as.data.frame()%>%
  dplyr::select(V11, V23, V30)%>%#select three variables
  mutate_all(list(SQ = squareit))%>%#square them
  as.matrix()%>%{.->>x1}#back to matrix; ->> pipes the result into x1 in the global environment
# x1[1:5,]
#             V11         V23        V30    V11_SQ       V23_SQ    V30_SQ
# [1,]  0.5706039  0.01473546  0.9080937 0.3255888 0.0002171337 0.8246342
# [2,]  0.4138668 -0.81898245 -2.0492817 0.1712857 0.6707322586 4.1995554
# [3,] -0.4876853  0.03927982 -1.2089006 0.2378369 0.0015429039 1.4614407
# [4,]  0.4644753 -0.98728378  1.7796744 0.2157373 0.9747292699 3.1672408
# [5,] -0.5239595 -0.13288456 -1.1970731 0.2745336 0.0176583076 1.4329841

#example model + coefficients
set.seed(4484)
final.glmnet<-cv.glmnet(x1, y, type.measure="auc",alpha=1,family="binomial")
coef(final.glmnet,s="lambda.min")#print model coefficients
# (Intercept)  0.4097381
# V11         -0.4487949
# V23          0.3322128
# V30         -0.2924600
# V11_SQ      -0.3334376
# V23_SQ      -0.2867963
# V30_SQ       0.3864748

#get model formula - I'm sure there is a better way
mc<-data.frame(as.matrix(coef(final.glmnet,s="lambda.min")))%>%
  rownames_to_column(.,var="rowname")%>%mutate(rowname=recode(rowname,`(Intercept)`="1"))
paste("(",mc$rowname,"*",mc$X1,")",sep="",collapse = "+")
#model formula
# (1*0.409738094215867)+(V11*-0.44879489356345)+(V23*0.332212780157358)+
# (V30*-0.292459974168585)+(V11_SQ*-0.333437624504966)+
# (V23_SQ*-0.286796292157334)+(V30_SQ*0.386474813542322)
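
#(Side note: a more compact way to get the same link values, assuming I understand the calculation
#correctly, is to multiply the design matrix by the coefficient vector; a minimal sketch:)
beta<-as.matrix(coef(final.glmnet,s="lambda.min"))#dense column vector, intercept first
eta<-drop(cbind(1,x1[1:5,])%*%beta)#prepend a column of 1s for the intercept
round(eta,3)#should match the longhand link values below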

#####predict on training data -- same predictions!
#1 predict using predict.cv.glmnet()
pred_cv.glmnet<-predict(final.glmnet, s=final.glmnet$lambda.min, newx=x1[1:5,], type='link')
round(pred_cv.glmnet,3)
#2 predict using model formula
pred.model.formula<-data.frame(x1[1:5,])%>%
  mutate(link=(1*0.409738094215867)+(V11*-0.44879489356345)+
           (V23*0.332212780157358)+(V30*-0.292459974168585)+
           (V11_SQ*-0.333437624504966)+(V23_SQ*-0.286796292157334)+
           (V30_SQ*0.386474813542322))%>%
  dplyr::select(link)
round(pred.model.formula,3)
# 1         0.103
# 2         1.925
# 3         1.480
# 4         0.225
# 5         1.408

#new data - set up for marginal effects plot
colnames(x1)
test<-data.frame(V23=seq(min(x1[,2]),max(x1[,2]),0.1))%>%#let V23 vary from min-max by .1
  mutate(V23_SQ=V23^2, V11=0, V11_SQ=0, V30=0, V30_SQ=0)%>%#square V23 and set others to 0
  as.matrix()
test[1:5,]
#           V23   V23_SQ V11 V11_SQ V30 V30_SQ
# [1,] -2.071573 4.291414   0      0   0      0
# [2,] -1.971573 3.887100   0      0   0      0
# [3,] -1.871573 3.502785   0      0   0      0
# [4,] -1.771573 3.138470   0      0   0      0
# [5,] -1.671573 2.794156   0      0   0      0

#####predict on new data -- different predictions!
#1 predict using predict.cv.glmnet()
pred_cv.glmnet<-predict(final.glmnet, s=final.glmnet$lambda.min, newx=test[1:5,], type='link')
round(pred_cv.glmnet,3)
# [1,] 2.765
# [2,] 2.586
# [3,] 2.413
# [4,] 2.247
# [5,] 2.088

#2 predict using model formula
pred.model.formula<-data.frame(test[1:5,])%>%
  mutate(link.longhand=(1*0.409738094215867)+(V11*-0.44879489356345)+
           (V23*0.332212780157358)+(V30*-0.292459974168585)+
           (V11_SQ*-0.333437624504966)+(V23_SQ*-0.286796292157334)+
           (V30_SQ*0.386474813542322))%>%
  dplyr::select(link.longhand)
round(pred.model.formula,3)
# 1        -1.509
# 2        -1.360
# 3        -1.217
# 4        -1.079
# 5        -0.947

Solution

  • Your understanding of how the calculation works appears to be correct. However, predict() matches the columns of newx to the coefficients by position, not by name, so newx must have the same columns in the same order as the original training matrix. When the columns are in a different order the predictions go haywire, as you see in your example. (A small defensive check for this is sketched at the end of this answer.)

    If you add a quick adjustment to put the columns of your test matrix in the same order as your x1 matrix (a handy shortcut is to use the dimnames of the original matrix inside a dplyr::select, as in the third line below)...

    test <- data.frame(V23 = seq(min(x1[, 2]), max(x1[, 2]), 0.1)) %>% #let V23 vary from min-max by .1
        mutate(V23_SQ = V23^2, V11 = 0, V11_SQ = 0, V30 = 0, V30_SQ = 0) %>% #square V23 and set others to 0
        select(dimnames(x1)[[2]]) %>% #use the dimnames from x1 to select test in proper order!
        as.matrix()
    

    ...you will find that the two approaches produce matching results, as expected.

    #1 predict using predict.cv.glmnet()
    pred_cv.glmnet <- predict(final.glmnet, s = 'lambda.min', newx = test[1:5,], type = 'link')
    round(pred_cv.glmnet,3)
    # [1,] -1.509
    # [2,] -1.360
    # [3,] -1.217
    # [4,] -1.079
    # [5,] -0.947
    
    #2 predict using model formula
    pred.model.formula <- data.frame(test[1:5,]) %>%
        mutate(link.longhand = ((1 * 0.409738094215867) +
                   (V11 * -0.44879489356345) +
                   (V23 * 0.332212780157358) +
                   (V30 * -0.292459974168585) +
                   (V11_SQ * -0.333437624504966) +
                   (V23_SQ * -0.286796292157334) +
                   (V30_SQ * 0.386474813542322))) %>%
        dplyr::select(link.longhand)
    round(pred.model.formula, 3)
    # link.longhand
    # 1        -1.509
    # 2        -1.360
    # 3        -1.217
    # 4        -1.079
    # 5        -0.947
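
    Since predict() matches columns by position rather than by name, a cheap guard before predicting on any new matrix is to check that its columns line up with the training matrix. A small defensive sketch (not a glmnet feature, just base R):

    # columns of the new data must be in the same order as the training matrix
    stopifnot(identical(colnames(test), colnames(x1)))
    pred_cv.glmnet <- predict(final.glmnet, s = "lambda.min", newx = test, type = "link")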