Using na.exclude in svyglm in the survey package

I have a question that was asked here several years ago without an answer. It would seem that when fitting a model with svyglm, NAs are omitted, irrespective of whether na.action is set or not. This matters also when trying to use the predict function to create new columns in existing data.frames.

To reiterate the same example given in the old post:

library('survey')
y  = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1)
x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 0, 1, 0, 0, 0, 2, 2, 2)
x2 = c(10, 21, 33, 55, 40, 30, 26, 84, NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87)
x3 = runif(20)
foo = data.frame(y, x1, x2, x3)

m1 = glm(y ~ x1 + x2, 
         family = binomial(logit), 
         na.action = na.exclude)

svy1 = svydesign(ids = ~ 0,
                 data = foo,
                 weights = ~ x3)

m2 <- svyglm(y ~ x1 + x2, 
             design = svy1, 
             family = binomial(logit), 
             na.action = na.exclude)

predict(m1)
predict(m2)

foo2 = foo
foo2$x2 = runif(nrow(foo), 15, 50)
foo2$x2[c(8, 15, 19)] = NA

predict(m1, newdata = foo2)
predict(m2, newdata = foo2)

predict.glm here produces predictions that include missing values, while predict.svyglm does not. Am I missing something?

Solution

The reason is, that survey:::predict.svyglm (where predict dispatches to) has no na.action argument, while stats:::predict.glm does. So obviously na.action is ignored in the former (see source code) and NA is removed by default.

> args(survey:::predict.svyglm)
function (object, newdata = NULL, total = NULL, type = c("link", 
    "response", "terms"), se.fit = (type != "terms"), vcov = FALSE, 
    ...) 
NULL

> args(stats:::predict.glm)
function (object, newdata = NULL, type = c("link", "response", 
    "terms"), se.fit = FALSE, dispersion = NULL, terms = NULL, 
    na.action = na.pass, ...) 
NULL

You could rewrite the method by implementing na.action in it, e.g. this way:

predict.svyglm <- function(object, newdata=NULL, total=NULL, type=c("link", "response", 
                      "terms"), se.fit=(type != "terms"), vcov=FALSE, na.action = na.pass, ...) {
  type <- match.arg(type)
  if (is.null(newdata)) {
    newdata <- model.frame(object$survey.design)
    na.act <- object$na.action
  } else {
    newdata <- na.action(newdata[attr(object$terms, 'term.labels')])
    na.act <- attr(newdata, 'na.action')
  }
  object$na.action <- NULL
  if (type == "terms") {
    eta <- survey:::predterms(object, se=se.fit, ...)
    if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
      eta <- stats:::napredict.exclude(na.act, eta)
    }
    return(eta)
  }
  tt <- delete.response(terms(formula(object)))
  mf <- model.frame(tt, data=newdata, xlev=object$xlevels)
  mm <- model.matrix(tt, mf, contrasts.arg=object$contrasts)
  if (!is.null(total) && attr(tt, "intercept")) {
    mm[, attr(tt, "intercept")] <- mm[, attr(tt, "intercept")] * 
      total
  }
  eta <- drop(mm %*% coef(object))
  d <- drop(object$family$mu.eta(eta))
  eta <- switch(type, link=eta, response=object$family$linkinv(eta))
  if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
    eta <- stats:::napredict.exclude(na.act, eta)
  }
  if (se.fit) {
    if (vcov) {
      vv <- mm %*% vcov(object) %*% t(mm)
      if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
        vv <- stats:::napredict.exclude(na.act, vv)
        vv <- t(stats:::napredict.exclude(na.act, t(vv)))
        d <- stats:::napredict.exclude(na.act, d)
      }
      attr(eta, "var") <- switch(type, link=vv, response=d * 
                                   (t(vv * d)))
    } else {
      vv <- drop(rowSums((mm %*% vcov(object)) * mm))
      if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
        vv <- stats:::napredict.exclude(na.act, vv)
      } 
      attr(eta, "var") <- switch(type, link=vv, response=drop(d * t(vv * d)))
    }
  }
  attr(eta, "statistic") <- type
  class(eta) <- "svystat"
  eta
}

Usage

library(survey)

svy1 <- svydesign(ids=~ 0, data=foo, weights=~ x3)

## na.omit
predict(object <- svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.omit),
        newdata=foo) 
#        link     SE
# 1  -2.74783 1.3374
# 2  -6.01717 2.4207
# 3  -1.50677 1.0252
# 4  -0.31968 1.4078
# 5  -1.12906 1.0833
# 6  -1.66865 1.0237
# 7   1.97841 1.5724
# 8   5.10803 1.6053
# 10  5.26990 1.6985
# 11 -2.20824 1.1194
# 12 -6.01717 2.4207
# 13 -5.90926 2.4398
# 14 -1.93845 1.0532
# 16 -3.91277 3.1713
# 17 -3.04943 3.6378
# 18  5.10803 1.6053
# 19  4.40656 1.2596
# 20  5.26990 1.6985
# Warning message:
#   In eval(family$initialize) : non-integer #successes in a binomial glm!

## na.exclude
predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.exclude), 
               newdata=foo)
#     response     SE
# 1  0.0602093 0.0757
# 2  0.0024306 0.0059
# 3  0.1814174 0.1522
# 4  0.4207545 0.3431
# 5  0.2443344 0.2000
# 6  0.1586042 0.1366
# 7  0.8785111 0.1678
# 8  0.9939883 0.0096
# 9         NA     NA
# 10 0.9948822 0.0086
# 11 0.0990130 0.0999
# 12 0.0024306 0.0059
# 13 0.0027069 0.0066
# 14 0.1258188 0.1158
# 15        NA     NA
# 16 0.0195934 0.0609
# 17 0.0452420 0.1571
# 18 0.9939883 0.0096
# 19 0.9879499 0.0150
# 20 0.9948822 0.0086
# Warning message:
#   In eval(family$initialize) : non-integer #successes in a binomial glm!

## na.exclude in predict.svyglm

predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial'), 
               newdata=foo_NA, na.action=na.exclude)
#    response     SE
# 1  0.999765 0.0013
# 2  0.999995 0.0000
# 3  0.999995 0.0000
# 4  0.999995 0.0000
# 5        NA     NA
# 6  0.643391 0.5614
# 7  0.999763 0.0013
# 8        NA     NA
# 9  0.643010 0.5619
# 10 0.036086 0.0566
# 11 0.641770 0.5636
# 12 0.988457 0.0453
# 13 0.645963 0.5578
# 14 0.999754 0.0014
# 15       NA     NA
# 16 0.640123 0.5659
# 17 0.999995 0.0000
# 18 0.999759 0.0014
# 19       NA     NA
# 20 0.999760 0.0014
# 21 0.641986 0.5633
# 22 0.644476 0.5599
# 23 0.988718 0.0441
# 24 0.037487 0.0579
# 25 0.999995 0.0000
# Warning message:
#   In eval(family$initialize) : non-integer #successes in a binomial glm!

Data:

foo <- structure(list(y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 
0, 1, 1, 1, 1, 1), x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 
0, 1, 0, 0, 0, 2, 2, 2), x2 = c(10, 21, 33, 55, 40, 30, 26, 84, 
NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87), x3 = c(0.560080145020038, 
0.260075693251565, 0.0398760288953781, 0.508186420192942, 0.496490396093577, 
0.372899192618206, 0.57026850595139, 0.968372587347403, 0.0817912963684648, 
0.722660716157407, 0.78440785780549, 0.0374867296777666, 0.201261537149549, 
0.332104755565524, 0.566964805591851, 0.010931215249002, 0.0853019708301872, 
0.46296559041366, 0.449913740158081, 0.759272540919483)), class = "data.frame", row.names = c(NA, 
-20L))


foo_NA <- structure(list(y = c(0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 
0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0), x1 = c(4L, 5L, 5L, 5L, 4L, 
2L, 4L, 3L, 2L, 1L, 2L, 3L, 2L, 4L, 4L, 2L, 5L, 4L, 5L, 4L, 2L, 
2L, 3L, 1L, 5L), x2 = c(0.982817197917029, 0.759544267551973, 
0.566488424083218, 0.849689718568698, NA, 0.271286614704877, 
0.828158485237509, NA, 0.240544739644974, 0.0429887960199267, 
0.140479094116017, 0.21638541505672, 0.479398564202711, 0.197410342283547, 
NA, 0.00788473873399198, 0.375489964615554, 0.514407708309591, 
NA, 0.581604002509266, 0.157905208179727, 0.359028305858374, 
0.645631878403947, 0.775823362637311, 0.563646841561422), x3 = c(0.233703398611397, 
0.0899805163498968, 0.0856120649259537, 0.305218369467184, 0.667426514672115, 
0.000238896580412984, 0.208569956943393, 0.933034127345309, 0.925644748611376, 
0.734094301005825, 0.333071983419359, 0.515063329832628, 0.743974646320567, 
0.619159240042791, 0.62624534452334, 0.217157698236406, 0.216567310970277, 
0.388945028651506, 0.942455691983923, 0.962608013767749, 0.73985527921468, 
0.73324590572156, 0.535761289997026, 0.00227296608500183, 0.608937452547252
)), row.names = c(NA, -25L), class = "data.frame")