Search code examples
rglmsurveyweightingmarginal-effects

How can I generate marginal effects for a logit model when using survey weights?


I normally generate logit model marginal effects using the mfx package and the logitmfx function. However, the current survey I am using has weights (which have a large effect on the proportion of the DV in the sample because of oversampling in some populations) and logitmfx doesn't appear to have any way to include weights.

I have fitted the model with svyglm as follows:

library(survey)

survey.design <- svydesign(ids = combined.survey$id,
                                        weights = combined.survey$weight,
                                            data = combined.survey)

vote.pred.1 <- svyglm(formula = turnout ~ gender + age.group + 
                                    education + income, 
                                 design = survey.design)
summary(vote.pred.1)

How can I generate marginal effects from these results?


Solution

  • I had the same question. Below I modified a function from the mfx package to calculate marginal effects using data organized as a survey object. I didn't do much, mostly replacing 'mean()' and similar commands that are designed to run on non-survey data with the survey-package equivalents. After the modified mfx code, there is code that runs an example.

    Background

    Details on the mfx package by Alan Fernihough: https://cran.r-project.org/web/packages/mfx/mfx.pdf

    Code for the mfx package on github (the files that I modified are probitmfxest.r and probitmfx.r): https://github.com/cran/mfx/tree/master/R

    In the mfx calculator, I commented out a lot of the flexibility built into the original functions that handled different assumptions about clusters and robust SEs. I could be wrong but I think those issues are already accounted for by using the regression estimation command from the survey package, svyglm().

    Marginal effects calculator

     library(survey)
    
     probitMfxEstSurv <-
        function(formula, 
                 design, 
                 atmean = TRUE, 
                 robust = FALSE, 
                 clustervar1 = NULL, 
                 clustervar2 = NULL, 
                 start = NULL
                 #           control = list() # this option is found in the original mfx package
        ){
    
          if(is.null(formula)){
            stop("model formula is missing")
          }
    
          for( i in 1:length(class(design))){
            if(!((class(design)[i] %in% "survey.design2") | (class(design)[i] %in% "survey.design"))){
              stop("design arguement must contain survey object")
            }
          }
    
          # from Fernihough's original mfx function
          # I dont think this is needed because the  
          # regression computed by the survey package should
          # take care of stratification and robust SEs
          # from the survey info
          # 
          #     # cluster sort part
          #     if(is.null(clustervar1) & !is.null(clustervar2)){
          #       stop("use clustervar1 arguement before clustervar2 arguement")
          #     }    
          #     if(!is.null(clustervar1)){
          #       if(is.null(clustervar2)){
          #         if(!(clustervar1 %in% names(data))){
          #           stop("clustervar1 not in data.frame object")
          #         }    
          #         data = data.frame(model.frame(formula, data, na.action=NULL),data[,clustervar1])
          #         names(data)[dim(data)[2]] = clustervar1
          #         data=na.omit(data)
          #       }
          #       if(!is.null(clustervar2)){
          #         if(!(clustervar1 %in% names(data))){
          #           stop("clustervar1 not in data.frame object")
          #         }    
          #         if(!(clustervar2 %in% names(data))){
          #           stop("clustervar2 not in data.frame object")
          #         }    
          #         data = data.frame(model.frame(formula, data, na.action=NULL),
          #                           data[,c(clustervar1,clustervar2)])
          #         names(data)[c(dim(data)[2]-1):dim(data)[2]] = c(clustervar1,clustervar2)
          #         data=na.omit(data)
          #       }
          #     }
    
          # fit the probit regression
          fit = svyglm(formula, 
                       design=design, 
                       family = quasibinomial(link = "probit"), 
                       x=T
          )
          # TS: summary(fit)
    
          # terms needed
          x1 = model.matrix(fit)
          if (any(alias <- is.na(coef(fit)))) {   # this conditional removes any vars with a NA coefficient
            x1 <- x1[, !alias, drop = FALSE]
          }
    
          xm = as.matrix(svymean(x1,design)) # calculate means of x variables
          be = as.matrix(na.omit(coef(fit))) # collect coefficients: be as in beta
          k1 = length(na.omit(coef(fit))) # collect number of coefficients or x variables
          xb = t(xm) %*% be # get the matrix product of xMean and beta, which is the model prediction at the mean
          fxb = ifelse(atmean==TRUE, dnorm(xb), mean(dnorm(x1 %*% be))) # collect either the overall predicted mean, or the average of every observation's predictions
    
          # get variances
          vcv = vcov(fit)
    
          # from Fernihough's original mfx function
          # I dont think this is needed because the  
          # regression computed by the survey package should
          # take care of stratification and robust SEs
          # from the survey info
          # 
          #     if(robust){
          #       if(is.null(clustervar1)){
          #         # white correction
          #         vcv = vcovHC(fit, type = "HC0")
          #       } else {
          #         if(is.null(clustervar2)){
          #           vcv = clusterVCV(data=data, fm=fit, cluster1=clustervar1,cluster2=NULL)
          #         } else {
          #           vcv = clusterVCV(data=data, fm=fit, cluster1=clustervar1,cluster2=clustervar2)
          #         }
          #       }
          #     }
          #     
          #     if(robust==FALSE & is.null(clustervar1)==FALSE){
          #       if(is.null(clustervar2)){
          #         vcv = clusterVCV(data=data, fm=fit, cluster1=clustervar1,cluster2=NULL)
          #       } else {
          #         vcv = clusterVCV(data=data, fm=fit, cluster1=clustervar1,cluster2=clustervar2)
          #       }
          #     }
    
          # set mfx equal to predicted mean (or other value) multiplied by beta
          mfx = data.frame(mfx=fxb*be, se=NA)
    
          # get standard errors
          if(atmean){#    fxb *  id matrix - avg model prediction * (beta X xmean)
            gr = as.numeric(fxb)*(diag(k1) - as.numeric(xb) *(be %*% t(xm)))
            mfx$se = sqrt(diag(gr %*% vcv %*% t(gr)))            
          } else {
            gr = apply(x1, 1, function(x){
              as.numeric(as.numeric(dnorm(x %*% be))*(diag(k1) - as.numeric(x %*% be)*(be %*% t(x))))
            })
            gr = matrix(apply(gr,1,mean),nrow=k1)
            mfx$se = sqrt(diag(gr %*% vcv %*% t(gr)))                
          }
    
          # pick out constant and remove from mfx table
          temp1 = apply(x1,2,function(x)length(table(x))==1)
          const = names(temp1[temp1==TRUE])
          mfx = mfx[row.names(mfx)!=const,]
    
          # pick out discrete change variables
          temp1 = apply(x1,2,function(x)length(table(x))==2)
          disch = names(temp1[temp1==TRUE])
    
          # calculate the disctrete change marginal effects and standard errors
          if(length(disch)!=0){
            for(i in 1:length(disch)){
              if(atmean){
                disx0 = disx1 = xm
                disx1[disch[i],] = max(x1[,disch[i]])
                disx0[disch[i],] = min(x1[,disch[i]])
                # mfx equal to    prediction @ x=1     minus prediction @ x=0
                mfx[disch[i],1] = pnorm(t(be) %*% disx1) - pnorm(t(be) %*% disx0)
                # standard errors
                gr = dnorm(t(be) %*% disx1) %*% t(disx1) - dnorm(t(be) %*% disx0) %*% t(disx0)
                mfx[disch[i],2] = sqrt(gr %*% vcv %*% t(gr))
              } else {
                disx0 = disx1 = x1
                disx1[,disch[i]] = max(x1[,disch[i]])
                disx0[,disch[i]] = min(x1[,disch[i]])  
                mfx[disch[i],1] = mean(pnorm(disx1 %*% be) - pnorm(disx0 %*% be))
                # standard errors
                gr = as.numeric(dnorm(disx1 %*% be)) * disx1 - as.numeric(dnorm(disx0 %*% be)) * disx0
                avegr = as.matrix(colMeans(gr))
                mfx[disch[i],2] = sqrt(t(avegr) %*% vcv %*% avegr)
              }
            }
          } 
          mfx$discretechgvar = ifelse(rownames(mfx) %in% disch, 1, 0)
          output = list(fit=fit, mfx=mfx)
          return(output)
        }
    
    
    
      probitMfxSurv <-
        function(formula, 
                 design, 
                 atmean = TRUE, 
                 robust = FALSE, 
                 clustervar1 = NULL, 
                 clustervar2 = NULL, 
                 start = NULL 
                 #           control = list() # this option is found in original mfx package
        )
        {
          #    res = probitMfxEstSurv(formula, design, atmean, robust, clustervar1, clustervar2, start, control)
          res = probitMfxEstSurv(formula, design, atmean, robust, clustervar1, clustervar2, start)
    
          est = NULL
          est$mfxest = cbind(dFdx = res$mfx$mfx,
                             StdErr = res$mfx$se,
                             z.value = res$mfx$mfx/res$mfx$se,
                             p.value = 2*pt(-abs(res$mfx$mfx/res$mfx$se), df = Inf))
          colnames(est$mfxest) = c("dF/dx","Std. Err.","z","P>|z|")
          rownames(est$mfxest) =  rownames(res$mfx)
    
          est$fit = res$fit
          est$dcvar = rownames(res$mfx[res$mfx$discretechgvar==1,])  
          est$call = match.call() 
          class(est) = "probitmfx"
          est
        }
    

    Example

      # initialize sample data
      nObs = 100
      x1 = rbinom(nObs,1,.5)
      x2 = rbinom(nObs,1,.3)
      #x3 = rbinom(100,1,.9)
      x3 = runif(nObs,0,.9)
    
      id = 1:nObs
      w1 = sample(c(10,50,100),nObs,replace=TRUE)
      #   dependnt variables
      ystar = x1 + x2 - x3 + rnorm(nObs)
      y = ifelse(ystar>0,1,0)
      #   set up data frame
      data = data.frame(id, w1, x1, x2, x3, ystar, y)
    
      # initialize survey
      survey.design <- svydesign(ids = data$id,
                                 weights = data$w1,
                                 data = data)
    
      mean(data$x2)
      sd(data$x2)/(length(data$x2))^0.5
      svymean(x=x2,design=survey.design)
    
      probit = svyglm(y~x1 + x2 + x3, design=survey.design, family=quasibinomial(link='probit'))
      summary(probit)
    
      probitMfxSurv(formula = y~x1 + x2 + x3, design = survey.design)