I have a question that was asked here several years ago without an answer. It would seem that when fitting a model with svyglm
, NAs are omitted, irrespective of whether na.action
is set or not. This matters also when trying to use the predict
function to create new columns in existing data.frames.
To reiterate the same example given in the old post:
library('survey')
y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1)
x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0, 0, 1, 0, 0, 0, 2, 2, 2)
x2 = c(10, 21, 33, 55, 40, 30, 26, 84, NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87)
x3 = runif(20)
foo = data.frame(y, x1, x2, x3)
m1 = glm(y ~ x1 + x2,
family = binomial(logit),
na.action = na.exclude)
svy1 = svydesign(ids = ~ 0,
data = foo,
weights = ~ x3)
m2 <- svyglm(y ~ x1 + x2,
design = svy1,
family = binomial(logit),
na.action = na.exclude)
predict(m1)
predict(m2)
foo2 = foo
foo2$x2 = runif(nrow(foo), 15, 50)
foo2$x2[c(8, 15, 19)] = NA
predict(m1, newdata = foo2)
predict(m2, newdata = foo2)
predict.glm
here produces predictions that include missing values, while predict.svyglm
does not. Am I missing something?
The reason is, that survey:::predict.svyglm
(where predict
dispatches to) has no na.action
argument, while stats:::predict.glm
does. So obviously na.action
is ignored in the former (see source code) and NA
is removed by default.
> args(survey:::predict.svyglm)
function (object, newdata = NULL, total = NULL, type = c("link",
"response", "terms"), se.fit = (type != "terms"), vcov = FALSE,
...)
NULL
> args(stats:::predict.glm)
function (object, newdata = NULL, type = c("link", "response",
"terms"), se.fit = FALSE, dispersion = NULL, terms = NULL,
na.action = na.pass, ...)
NULL
You could rewrite the method by implementing na.action
in it, e.g. this way:
predict.svyglm <- function(object, newdata=NULL, total=NULL, type=c("link", "response",
"terms"), se.fit=(type != "terms"), vcov=FALSE, na.action = na.pass, ...) {
type <- match.arg(type)
if (is.null(newdata)) {
newdata <- model.frame(object$survey.design)
na.act <- object$na.action
} else {
newdata <- na.action(newdata[attr(object$terms, 'term.labels')])
na.act <- attr(newdata, 'na.action')
}
object$na.action <- NULL
if (type == "terms") {
eta <- survey:::predterms(object, se=se.fit, ...)
if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
eta <- stats:::napredict.exclude(na.act, eta)
}
return(eta)
}
tt <- delete.response(terms(formula(object)))
mf <- model.frame(tt, data=newdata, xlev=object$xlevels)
mm <- model.matrix(tt, mf, contrasts.arg=object$contrasts)
if (!is.null(total) && attr(tt, "intercept")) {
mm[, attr(tt, "intercept")] <- mm[, attr(tt, "intercept")] *
total
}
eta <- drop(mm %*% coef(object))
d <- drop(object$family$mu.eta(eta))
eta <- switch(type, link=eta, response=object$family$linkinv(eta))
if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
eta <- stats:::napredict.exclude(na.act, eta)
}
if (se.fit) {
if (vcov) {
vv <- mm %*% vcov(object) %*% t(mm)
if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
vv <- stats:::napredict.exclude(na.act, vv)
vv <- t(stats:::napredict.exclude(na.act, t(vv)))
d <- stats:::napredict.exclude(na.act, d)
}
attr(eta, "var") <- switch(type, link=vv, response=d *
(t(vv * d)))
} else {
vv <- drop(rowSums((mm %*% vcov(object)) * mm))
if (identical(attr(na.act, 'class'), 'exclude') && !is.null(na.act)) {
vv <- stats:::napredict.exclude(na.act, vv)
}
attr(eta, "var") <- switch(type, link=vv, response=drop(d * t(vv * d)))
}
}
attr(eta, "statistic") <- type
class(eta) <- "svystat"
eta
}
library(survey)
svy1 <- svydesign(ids=~ 0, data=foo, weights=~ x3)
## na.omit
predict(object <- svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.omit),
newdata=foo)
# link SE
# 1 -2.74783 1.3374
# 2 -6.01717 2.4207
# 3 -1.50677 1.0252
# 4 -0.31968 1.4078
# 5 -1.12906 1.0833
# 6 -1.66865 1.0237
# 7 1.97841 1.5724
# 8 5.10803 1.6053
# 10 5.26990 1.6985
# 11 -2.20824 1.1194
# 12 -6.01717 2.4207
# 13 -5.90926 2.4398
# 14 -1.93845 1.0532
# 16 -3.91277 3.1713
# 17 -3.04943 3.6378
# 18 5.10803 1.6053
# 19 4.40656 1.2596
# 20 5.26990 1.6985
# Warning message:
# In eval(family$initialize) : non-integer #successes in a binomial glm!
## na.exclude
predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial', na.action=na.exclude),
newdata=foo)
# response SE
# 1 0.0602093 0.0757
# 2 0.0024306 0.0059
# 3 0.1814174 0.1522
# 4 0.4207545 0.3431
# 5 0.2443344 0.2000
# 6 0.1586042 0.1366
# 7 0.8785111 0.1678
# 8 0.9939883 0.0096
# 9 NA NA
# 10 0.9948822 0.0086
# 11 0.0990130 0.0999
# 12 0.0024306 0.0059
# 13 0.0027069 0.0066
# 14 0.1258188 0.1158
# 15 NA NA
# 16 0.0195934 0.0609
# 17 0.0452420 0.1571
# 18 0.9939883 0.0096
# 19 0.9879499 0.0150
# 20 0.9948822 0.0086
# Warning message:
# In eval(family$initialize) : non-integer #successes in a binomial glm!
## na.exclude in predict.svyglm
predict(svyglm(y ~ x1 + x2, design=svy1, family='binomial'),
newdata=foo_NA, na.action=na.exclude)
# response SE
# 1 0.999765 0.0013
# 2 0.999995 0.0000
# 3 0.999995 0.0000
# 4 0.999995 0.0000
# 5 NA NA
# 6 0.643391 0.5614
# 7 0.999763 0.0013
# 8 NA NA
# 9 0.643010 0.5619
# 10 0.036086 0.0566
# 11 0.641770 0.5636
# 12 0.988457 0.0453
# 13 0.645963 0.5578
# 14 0.999754 0.0014
# 15 NA NA
# 16 0.640123 0.5659
# 17 0.999995 0.0000
# 18 0.999759 0.0014
# 19 NA NA
# 20 0.999760 0.0014
# 21 0.641986 0.5633
# 22 0.644476 0.5599
# 23 0.988718 0.0441
# 24 0.037487 0.0579
# 25 0.999995 0.0000
# Warning message:
# In eval(family$initialize) : non-integer #successes in a binomial glm!
Data:
foo <- structure(list(y = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1), x1 = c(1, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 0,
0, 1, 0, 0, 0, 2, 2, 2), x2 = c(10, 21, 33, 55, 40, 30, 26, 84,
NA, 87, 20, 21, 23, 25, NA, 60, 76, 84, 71, 87), x3 = c(0.560080145020038,
0.260075693251565, 0.0398760288953781, 0.508186420192942, 0.496490396093577,
0.372899192618206, 0.57026850595139, 0.968372587347403, 0.0817912963684648,
0.722660716157407, 0.78440785780549, 0.0374867296777666, 0.201261537149549,
0.332104755565524, 0.566964805591851, 0.010931215249002, 0.0853019708301872,
0.46296559041366, 0.449913740158081, 0.759272540919483)), class = "data.frame", row.names = c(NA,
-20L))
foo_NA <- structure(list(y = c(0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0), x1 = c(4L, 5L, 5L, 5L, 4L,
2L, 4L, 3L, 2L, 1L, 2L, 3L, 2L, 4L, 4L, 2L, 5L, 4L, 5L, 4L, 2L,
2L, 3L, 1L, 5L), x2 = c(0.982817197917029, 0.759544267551973,
0.566488424083218, 0.849689718568698, NA, 0.271286614704877,
0.828158485237509, NA, 0.240544739644974, 0.0429887960199267,
0.140479094116017, 0.21638541505672, 0.479398564202711, 0.197410342283547,
NA, 0.00788473873399198, 0.375489964615554, 0.514407708309591,
NA, 0.581604002509266, 0.157905208179727, 0.359028305858374,
0.645631878403947, 0.775823362637311, 0.563646841561422), x3 = c(0.233703398611397,
0.0899805163498968, 0.0856120649259537, 0.305218369467184, 0.667426514672115,
0.000238896580412984, 0.208569956943393, 0.933034127345309, 0.925644748611376,
0.734094301005825, 0.333071983419359, 0.515063329832628, 0.743974646320567,
0.619159240042791, 0.62624534452334, 0.217157698236406, 0.216567310970277,
0.388945028651506, 0.942455691983923, 0.962608013767749, 0.73985527921468,
0.73324590572156, 0.535761289997026, 0.00227296608500183, 0.608937452547252
)), row.names = c(NA, -25L), class = "data.frame")