Search code examples
rpredictsurvey

Using na.exclude in svyglm in the survey package


I have a question that was asked here several years ago without an answer. It would seem that when fitting a model with svyglm, NAs are omitted, irrespective of whether na.action is set or not. This matters also when trying to use the predict function to create new columns in existing data.frames.

To reiterate the same example given in the old post:

library('survey')
y  = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1)
x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 0, 1, 0, 0, 0, 2, 2, 2)
x2 = c(10, 21, 33, 55, 40, 30, 26, 84, NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87)
x3 = runif(20)
foo = data.frame(y, x1, x2, x3)

m1 = glm(y ~ x1 + x2, 
         family = binomial(logit), 
         na.action = na.exclude)

svy1 = svydesign(ids = ~ 0,
                 data = foo,
                 weights = ~ x3)

m2 <- svyglm(y ~ x1 + x2, 
             design = svy1, 
             family = binomial(logit), 
             na.action = na.exclude)

predict(m1)
predict(m2)

foo2 = foo
foo2$x2 = runif(nrow(foo), 15, 50)
foo2$x2[c(8, 15, 19)] = NA

predict(m1, newdata = foo2)
predict(m2, newdata = foo2)

predict.glm here produces predictions that include missing values, while predict.svyglm does not. Am I missing something?


Solution

  • The reason is, that survey:::predict.svyglm (where predict dispatches to) has no na.action argument, while stats:::predict.glm does. So obviously na.action is ignored in the former (see source code) and NA is removed by default.

    > args(survey:::predict.svyglm)
    function (object, newdata = NULL, total = NULL, type = c("link", 
        "response", "terms"), se.fit = (type != "terms"), vcov = FALSE, 
        ...) 
    NULL
    
    > args(stats:::predict.glm)
    function (object, newdata = NULL, type = c("link", "response", 
        "terms"), se.fit = FALSE, dispersion = NULL, terms = NULL, 
        na.action = na.pass, ...) 
    NULL
    

    You could rewrite the method by implementing na.action in it, e.g. this way:

    predict.svyglm <- function(object, newdata=NULL, total=NULL, type=c("link", "response", 
                          "terms"), se.fit=(type != "terms"), vcov=FALSE, na.action = na.pass, ...) {
      type <- match.arg(type)
      if (is.null(newdata)) {
        newdata <- model.frame(object$survey.design)
        na.act <- object$na.action
      } else {
        newdata <- na.action(newdata[attr(object$terms, 'term.labels')])
        na.act <- attr(newdata, 'na.action')
      }
      object$na.action <- NULL
      if (type == "terms") {
        eta <- survey:::predterms(object, se=se.fit, ...)
        if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
          eta <- stats:::napredict.exclude(na.act, eta)
        }
        return(eta)
      }
      tt <- delete.response(terms(formula(object)))
      mf <- model.frame(tt, data=newdata, xlev=object$xlevels)
      mm <- model.matrix(tt, mf, contrasts.arg=object$contrasts)
      if (!is.null(total) && attr(tt, "intercept")) {
        mm[, attr(tt, "intercept")] <- mm[, attr(tt, "intercept")] * 
          total
      }
      eta <- drop(mm %*% coef(object))
      d <- drop(object$family$mu.eta(eta))
      eta <- switch(type, link=eta, response=object$family$linkinv(eta))
      if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
        eta <- stats:::napredict.exclude(na.act, eta)
      }
      if (se.fit) {
        if (vcov) {
          vv <- mm %*% vcov(object) %*% t(mm)
          if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
            vv <- stats:::napredict.exclude(na.act, vv)
            vv <- t(stats:::napredict.exclude(na.act, t(vv)))
            d <- stats:::napredict.exclude(na.act, d)
          }
          attr(eta, "var") <- switch(type, link=vv, response=d * 
                                       (t(vv * d)))
        } else {
          vv <- drop(rowSums((mm %*% vcov(object)) * mm))
          if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
            vv <- stats:::napredict.exclude(na.act, vv)
          } 
          attr(eta, "var") <- switch(type, link=vv, response=drop(d * t(vv * d)))
        }
      }
      attr(eta, "statistic") <- type
      class(eta) <- "svystat"
      eta
    }
    

    Usage

    library(survey)
    
    svy1 <- svydesign(ids=~ 0, data=foo, weights=~ x3)
    
    ## na.omit
    predict(object <- svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.omit),
            newdata=foo) 
    #        link     SE
    # 1  -2.74783 1.3374
    # 2  -6.01717 2.4207
    # 3  -1.50677 1.0252
    # 4  -0.31968 1.4078
    # 5  -1.12906 1.0833
    # 6  -1.66865 1.0237
    # 7   1.97841 1.5724
    # 8   5.10803 1.6053
    # 10  5.26990 1.6985
    # 11 -2.20824 1.1194
    # 12 -6.01717 2.4207
    # 13 -5.90926 2.4398
    # 14 -1.93845 1.0532
    # 16 -3.91277 3.1713
    # 17 -3.04943 3.6378
    # 18  5.10803 1.6053
    # 19  4.40656 1.2596
    # 20  5.26990 1.6985
    # Warning message:
    #   In eval(family$initialize) : non-integer #successes in a binomial glm!
    

    ## na.exclude
    predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.exclude), 
                   newdata=foo)
    #     response     SE
    # 1  0.0602093 0.0757
    # 2  0.0024306 0.0059
    # 3  0.1814174 0.1522
    # 4  0.4207545 0.3431
    # 5  0.2443344 0.2000
    # 6  0.1586042 0.1366
    # 7  0.8785111 0.1678
    # 8  0.9939883 0.0096
    # 9         NA     NA
    # 10 0.9948822 0.0086
    # 11 0.0990130 0.0999
    # 12 0.0024306 0.0059
    # 13 0.0027069 0.0066
    # 14 0.1258188 0.1158
    # 15        NA     NA
    # 16 0.0195934 0.0609
    # 17 0.0452420 0.1571
    # 18 0.9939883 0.0096
    # 19 0.9879499 0.0150
    # 20 0.9948822 0.0086
    # Warning message:
    #   In eval(family$initialize) : non-integer #successes in a binomial glm!
    

    ## na.exclude in predict.svyglm
    
    predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial'), 
                   newdata=foo_NA, na.action=na.exclude)
    #    response     SE
    # 1  0.999765 0.0013
    # 2  0.999995 0.0000
    # 3  0.999995 0.0000
    # 4  0.999995 0.0000
    # 5        NA     NA
    # 6  0.643391 0.5614
    # 7  0.999763 0.0013
    # 8        NA     NA
    # 9  0.643010 0.5619
    # 10 0.036086 0.0566
    # 11 0.641770 0.5636
    # 12 0.988457 0.0453
    # 13 0.645963 0.5578
    # 14 0.999754 0.0014
    # 15       NA     NA
    # 16 0.640123 0.5659
    # 17 0.999995 0.0000
    # 18 0.999759 0.0014
    # 19       NA     NA
    # 20 0.999760 0.0014
    # 21 0.641986 0.5633
    # 22 0.644476 0.5599
    # 23 0.988718 0.0441
    # 24 0.037487 0.0579
    # 25 0.999995 0.0000
    # Warning message:
    #   In eval(family$initialize) : non-integer #successes in a binomial glm!
    

    Data:

    foo <- structure(list(y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 
    0, 1, 1, 1, 1, 1), x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 
    0, 1, 0, 0, 0, 2, 2, 2), x2 = c(10, 21, 33, 55, 40, 30, 26, 84, 
    NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87), x3 = c(0.560080145020038, 
    0.260075693251565, 0.0398760288953781, 0.508186420192942, 0.496490396093577, 
    0.372899192618206, 0.57026850595139, 0.968372587347403, 0.0817912963684648, 
    0.722660716157407, 0.78440785780549, 0.0374867296777666, 0.201261537149549, 
    0.332104755565524, 0.566964805591851, 0.010931215249002, 0.0853019708301872, 
    0.46296559041366, 0.449913740158081, 0.759272540919483)), class = "data.frame", row.names = c(NA, 
    -20L))
    
    
    foo_NA <- structure(list(y = c(0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 
    0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0), x1 = c(4L, 5L, 5L, 5L, 4L, 
    2L, 4L, 3L, 2L, 1L, 2L, 3L, 2L, 4L, 4L, 2L, 5L, 4L, 5L, 4L, 2L, 
    2L, 3L, 1L, 5L), x2 = c(0.982817197917029, 0.759544267551973, 
    0.566488424083218, 0.849689718568698, NA, 0.271286614704877, 
    0.828158485237509, NA, 0.240544739644974, 0.0429887960199267, 
    0.140479094116017, 0.21638541505672, 0.479398564202711, 0.197410342283547, 
    NA, 0.00788473873399198, 0.375489964615554, 0.514407708309591, 
    NA, 0.581604002509266, 0.157905208179727, 0.359028305858374, 
    0.645631878403947, 0.775823362637311, 0.563646841561422), x3 = c(0.233703398611397, 
    0.0899805163498968, 0.0856120649259537, 0.305218369467184, 0.667426514672115, 
    0.000238896580412984, 0.208569956943393, 0.933034127345309, 0.925644748611376, 
    0.734094301005825, 0.333071983419359, 0.515063329832628, 0.743974646320567, 
    0.619159240042791, 0.62624534452334, 0.217157698236406, 0.216567310970277, 
    0.388945028651506, 0.942455691983923, 0.962608013767749, 0.73985527921468, 
    0.73324590572156, 0.535761289997026, 0.00227296608500183, 0.608937452547252
    )), row.names = c(NA, -25L), class = "data.frame")