Search code examples
rconfidence-intervalsurvival-analysis

Is there a way to see the formula that R uses for the survfit confidence intervals?


I want to be able to see how the summary of survfit calculates its confidence intervals. Is there a way that I can ask R to show me how it calculated these to show me the formula?

Thanks :)


Solution

  • You can find the source code like this. First look at survfit:

    > getAnywhere("survfit")
    A single object matching ‘survfit’ was found
    It was found in the following places
      package:survival
      namespace:survival
    with value
    function (formula, ...) 
    {
        UseMethod("survfit", formula)
    }
    <bytecode: 0x000000000edccc88>
    <environment: namespace:survival>
    >
    

    This tells us we have to look at survfit.formula, which is what that UseMethod call is saying. So we do that and we get a lot of code:

    > getAnywhere("survfit.formula")
    A single object matching ‘survfit.formula’ was found
    It was found in the following places
      package:survival
      registered S3 method for survfit from namespace survival
      namespace:survival
    with value
    
    function (formula, data, weights, subset, na.action, etype, id, 
        istate, ...) 
    {
        Call <- match.call()
        Call[[1]] <- as.name("survfit")
        mfnames <- c("formula", "data", "weights", "subset", "na.action", 
            "istate", "id", "etype")
        temp <- Call[c(1, match(mfnames, names(Call), nomatch = 0))]
        temp[[1]] <- as.name("model.frame")
        if (is.R()) 
            m <- eval.parent(temp)
        else m <- eval(temp, sys.parent())
        Terms <- terms(formula, c("strata", "cluster"))
        ord <- attr(Terms, "order")
        if (length(ord) & any(ord != 1)) 
            stop("Interaction terms are not valid for this function")
        n <- nrow(m)
        Y <- model.extract(m, "response")
        if (!is.Surv(Y)) 
            stop("Response must be a survival object")
        casewt <- model.extract(m, "weights")
        if (is.null(casewt)) 
            casewt <- rep(1, n)
        if (!is.null(attr(Terms, "offset"))) 
            warning("Offset term ignored")
        id <- model.extract(m, "id")
        istate <- model.extract(m, "istate")
        temp <- untangle.specials(Terms, "cluster")
        if (length(temp$vars) > 0) {
            if (length(temp$vars) > 1) 
                stop("can not have two cluster terms")
            if (!is.null(id)) 
                stop("can not have both a cluster term and an id variable")
            id <- m[[temp$vars]]
            Terms <- Terms[-temp$terms]
        }
        ll <- attr(Terms, "term.labels")
        if (length(ll) == 0) 
            X <- factor(rep(1, n))
        else X <- strata(m[ll])
        if (!is.Surv(Y)) 
            stop("y must be a Surv object")
        etype <- model.extract(m, "etype")
        if (!is.null(etype)) {
            if (attr(Y, "type") == "mcounting" || attr(Y, "type") == 
                "mright") 
                stop("cannot use both the etype argument and mstate survival type")
            if (length(istate)) 
                stop("cannot use both the etype and istate arguments")
            status <- Y[, ncol(Y)]
            etype <- as.factor(etype)
            temp <- table(etype, status == 0)
            if (all(rowSums(temp == 0) == 1)) {
                newlev <- levels(etype)[order(-temp[, 2])]
            }
            else newlev <- c(" ", levels(etype)[temp[, 1] > 0])
            status <- factor(ifelse(status == 0, 0, as.numeric(etype)), 
                labels = newlev)
            if (attr(Y, "type") == "right") 
                Y <- Surv(Y[, 1], status, type = "mstate")
            else if (attr(Y, "type") == "counting") 
                Y <- Surv(Y[, 1], Y[, 2], status, type = "mstate")
            else stop("etype argument incompatable with survival type")
        }
        if (attr(Y, "type") == "left" || attr(Y, "type") == "interval") 
            temp <- survfitTurnbull(X, Y, casewt, ...)
        else if (attr(Y, "type") == "right" || attr(Y, "type") == 
            "counting") 
            temp <- survfitKM(X, Y, casewt, ...)
        else if (attr(Y, "type") == "mright" || attr(Y, "type") == 
            "mcounting") 
            temp <- survfitCI(X, Y, weights = casewt, id = id, istate = istate, 
                ...)
        else {
            stop("unrecognized survival type")
        }
        if (is.null(temp$states)) 
            class(temp) <- "survfit"
        else class(temp) <- c("survfitms", "survfit")
        if (!is.null(attr(m, "na.action"))) 
            temp$na.action <- attr(m, "na.action")
        temp$call <- Call
        temp
    }
    <bytecode: 0x000000003f6a8c28>
    <environment: namespace:survival>
    

    We scan this and eventually notice a call to survfitCI close to the end. Sounds like what we are looking for. So once again into the breech:

    > getAnywhere("survfitCI")
    A single object matching ‘survfitCI’ was found
    It was found in the following places
      package:survival
      namespace:survival
    with value
    
    function (X, Y, weights, id, istate, type = c("kaplan-meier", 
        "fleming-harrington", "fh2"), se.fit = TRUE, conf.int = 0.95, 
        conf.type = c("log", "log-log", "plain", "none"), conf.lower = c("usual", 
            "peto", "modified")) 
    {
        method <- match.arg(type)
        conf.type <- match.arg(conf.type)
        conf.lower <- match.arg(conf.lower)
        if (is.logical(conf.int)) {
            if (!conf.int) 
                conf.type <- "none"
            conf.int <- 0.95
        }
        type <- attr(Y, "type")
        if (type != "mright" && type != "mcounting" && type != "right" && 
            type != "counting") 
            stop(paste("Cumulative incidence computation doesn't support \"", 
                type, "\" survival data", sep = ""))
        n <- nrow(Y)
        status <- Y[, ncol(Y)]
        ncurve <- length(levels(X))
        state.names <- attr(Y, "states")
        if (missing(istate) || is.null(istate)) 
            istate <- rep(0L, n)
        else if (is.factor(istate) || is.character(istate)) {
            temp <- as.factor(istate)
            appear <- (levels(istate))[unique(as.numeric(istate))]
            state.names <- unique(c(attr(Y, "states"), appear))
            istate <- as.numeric(factor(as.character(istate), levels = state.names))
        }
        else if (!is.numeric(istate) || any(istate != floor(istate))) 
            stop("istate should be a vector of integers or a factor")
        if (length(id) == 0) 
            id <- 1:n
        if (length(istate) == 1) 
            istate <- rep(istate, n)
        if (length(istate) != n) 
            stop("wrong length for istate")
        states <- sort(unique(c(istate, 1:length(attr(Y, "states")))))
        docurve2 <- function(entry, etime, status, istate, wt, states, 
            id, se.fit) {
            ftime <- factor(c(entry, etime))
            ltime <- levels(ftime)
            ftime <- matrix(as.integer(ftime), ncol = 2)
            timeset <- as.numeric(ltime[sort(unique(ftime[, 2]))])
            nstate <- length(states)
            uid <- sort(unique(id))
            P <- as.vector(tapply(wt, factor(istate, levels = states), 
                sum)/sum(wt))
            P <- ifelse(is.na(P), 0, P)
            cstate <- istate[match(uid, id)]
            storage.mode(wt) <- "double"
            storage.mode(cstate) <- "integer"
            storage.mode(status) <- "integer"
            fit <- .Call(Csurvfitci, ftime, order(ftime[, 1]) - 1L, 
                order(ftime[, 2]) - 1L, length(timeset), status, 
                cstate - 1L, wt, match(id, uid) - 1L, P, as.integer(se.fit))
            prev0 <- table(factor(cstate, levels = states), exclude = NA)/length(cstate)
            if (se.fit) 
                list(time = timeset, pmat = t(fit$p), std = sqrt(t(fit$var)), 
                    n.risk = colSums(fit$nrisk), n.event = fit$nevent, 
                    n.censor = fit$ncensor, prev0 = prev0, cumhaz = array(fit$cumhaz, 
                      dim = c(nstate, nstate, length(timeset))))
            else list(time = timeset, pmat = t(fit$p), n.risk = colSums(fit$nrisk), 
                n.event = fit$nevent, n.censor = fit$ncensor, prev0 = prev0, 
                cumhaz = array(fit$cumhaz, dim = c(nstate, nstate, 
                    length(timeset))))
        }
        if (any(states == 0)) {
            state0 <- TRUE
            states <- states + 1
            istate <- istate + 1
            status <- ifelse(status == 0, 0, status + 1)
        }
        else state0 <- FALSE
        curves <- vector("list", ncurve)
        names(curves) <- levels(X)
        if (ncol(Y) == 2) {
            indx <- which(status == istate & status != 0)
            if (length(indx)) {
                warning("an observation transitions to it's starting state, transition ignored")
                status[indx] <- 0
            }
            if (length(id) && any(duplicated(id))) 
                stop("Cannot have duplicate id values with (time, status) data")
            entry <- rep(min(-1, 2 * min(Y[, 1]) - 1), n)
            for (i in levels(X)) {
                indx <- which(X == i)
                curves[[i]] <- docurve2(entry[indx], Y[indx, 1], 
                    status[indx], istate[indx], weights[indx], states, 
                    id[indx], se.fit)
            }
        }
        else {
            if (missing(id) || is.null(id)) 
                stop("the id argument is required for start:stop data")
            indx <- order(id, Y[, 2])
            indx1 <- c(NA, indx)
            indx2 <- c(indx, NA)
            same <- (id[indx1] == id[indx2] & !is.na(indx1) & !is.na(indx2))
            if (any(same & X[indx1] != X[indx2])) {
                who <- 1 + min(which(same & X[indx1] != X[indx2]))
                stop("subject is in two different groups, id ", (id[indx1])[who])
            }
            if (any(same & Y[indx1, 2] != Y[indx2, 1])) {
                who <- 1 + min(which(same & Y[indx1, 2] != Y[indx2, 
                    1]))
                stop("gap in follow-up, id ", (id[indx1])[who])
            }
            if (any(Y[, 1] == Y[, 2])) 
                stop("cannot have start time == stop time")
            if (any(same & Y[indx1, 3] == Y[indx2, 3] & Y[indx1, 
                3] != 0)) {
                who <- 1 + min(which(same & Y[indx1, 1] != Y[indx2, 
                    2]))
                warning("subject changes to the same state, id ", 
                    (id[indx1])[who])
            }
            if (any(same & weights[indx1] != weights[indx2])) {
                who <- 1 + min(which(same & weights[indx1] != weights[indx2]))
                stop("subject changes case weights, id ", (id[indx1])[who])
            }
            indx <- order(Y[, 2])
            uid <- unique(id)
            temp <- (istate[indx])[match(uid, id[indx])]
            istate <- temp[match(id, uid)]
            for (i in levels(X)) {
                indx <- which(X == i)
                curves[[i]] <- docurve2(Y[indx, 1], Y[indx, 2], status[indx], 
                    istate[indx], weights[indx], states, id[indx], 
                    se.fit)
            }
        }
        grabit <- function(clist, element) {
            temp <- (clist[[1]][[element]])
            if (is.matrix(temp)) {
                nc <- ncol(temp)
                matrix(unlist(lapply(clist, function(x) t(x[[element]]))), 
                    byrow = T, ncol = nc)
            }
            else {
                xx <- as.vector(unlist(lapply(clist, function(x) x[element])))
                if (class(temp) == "table") 
                    matrix(xx, byrow = T, ncol = length(temp))
                else xx
            }
        }
        kfit <- list(n = as.vector(table(X)), time = grabit(curves, 
            "time"), n.risk = grabit(curves, "n.risk"), n.event = grabit(curves, 
            "n.event"), n.censor = grabit(curves, "n.censor"), prev = grabit(curves, 
            "pmat"), prev0 = grabit(curves, "prev0"))
        nstate <- length(states)
        kfit$cumhaz <- array(unlist(lapply(curves, function(x) x$cumhaz)), 
            dim = c(nstate, nstate, length(kfit$time)))
        if (length(curves) > 1) 
            kfit$strata <- unlist(lapply(curves, function(x) length(x$time)))
        if (se.fit) 
            kfit$std.err <- grabit(curves, "std")
        if (state0) {
            kfit$prev <- kfit$prev[, -1]
            if (se.fit) 
                kfit$std.err <- kfit$std.err[, -1]
            kfit$prev0 <- kfit$prev0[, -1]
        }
        if (se.fit) {
            std.err <- kfit$std.err
            zval <- qnorm(1 - (1 - conf.int)/2, 0, 1)
            surv <- 1 - kfit$prev
            if (conf.type == "plain") {
                temp <- zval * std.err
                kfit <- c(kfit, list(lower = pmax(kfit$prev - temp, 
                    0), upper = pmin(kfit$prev + temp, 1), conf.type = "plain", 
                    conf.int = conf.int))
            }
            if (conf.type == "log") {
                xx <- ifelse(kfit$prev == 1, 1, 1 - kfit$prev)
                temp1 <- ifelse(surv == 0, NA, exp(log(xx) + zval * 
                    std.err/xx))
                temp2 <- ifelse(surv == 0, NA, exp(log(xx) - zval * 
                    std.err/xx))
                kfit <- c(kfit, list(lower = pmax(1 - temp1, 0), 
                    upper = 1 - temp2, conf.type = "log", conf.int = conf.int))
            }
            if (conf.type == "log-log") {
                who <- (surv == 0 | surv == 1)
                temp3 <- ifelse(surv == 0, NA, 1)
                xx <- ifelse(who, 0.1, kfit$surv)
                temp1 <- exp(-exp(log(-log(xx)) + zval * std.err/(xx * 
                    log(xx))))
                temp1 <- ifelse(who, temp3, temp1)
                temp2 <- exp(-exp(log(-log(xx)) - zval * std.err/(xx * 
                    log(xx))))
                temp2 <- ifelse(who, temp3, temp2)
                kfit <- c(kfit, list(lower = 1 - temp1, upper = 1 - 
                    temp2, conf.type = "log-log", conf.int = conf.int))
            }
        }
        kfit$states <- state.names
        kfit$type <- attr(Y, "type")
        kfit
    }
    <bytecode: 0x000000002ce81838>
    <environment: namespace:survival>
    

    Somewhere in there is your answer.