Search code examples
rggplot2survival-analysis

How does one recombine a data back into a meaningful single survival curve after running SurvSplit and make predictions off of said curve?


I have a cox model that has a time dependence aspect to it that was resolved by splitting the curve into 3 pieces. After doing so and confirming that the proportionality is restored I would like to make a prediction with the cox model. The survival package documentation shows this but only for non split data.

How can I transform it back into a more interpretable single plot where I can check treatment vs control etc...

Minimum Example

# Clean your workspace environment
# rm(list = ls())

# Clear the plots tab (if any plots are open)
# if (!is.null(dev.list())) dev.off()

# Clear the console (equivalent to Ctrl+L shortcut)
# cat("\014")

library("survival") 
library("survminer")

vet2 <-survSplit(Surv(time, status) ~., data= veteran, cut=c(90, 180), episode= "tgroup", id="id")
vfit2 <-coxph(Surv(tstart, time, status) ~ trt + prior + karno:strata(tgroup),data=vet2)
scurve0 <- survfit(vfit2, newdata =  vet2)
#plot(scurve0)
ggsurvplot(scurve0, data=vfit2,legend = "none" ) 

plotdata <- data.frame( 
                          trt = c(0,0,0,0,0,0),
                          prior = c(0,10,0,10,0,10),
                          karno = c(60,99,60,99,60,99),
                          tgroup = c(1,2,3)
)
scurve1 <- survfit(vfit2, newdata =  plotdata)
ggsurvplot(scurve0, data=vfit2,legend = "none" ) 
scurve1 <- survfit(vfit2, newdata =  plotdata)

this produces the following plot for ** both ** plot calls. enter image description here

How can I get this back as a single curve? I need to turn the legend off because the strata creates so many cases. When I try to plot the model with a new data I get the exact same plot so I do not think it is working


Solution

  • It doesn't look like there is a straightforward solution to this problem, e.g. as commented by @IRTFM:

    "Terry Therneau says he doesn't know how to draw predicted survival curves when using models with time-varying coefficients. Search the r-help archives where this has been repeatedly discussed. This is what he said 2 years ago: "The "survival curve" for a time dependent covariate is something that is not easily defined. Read chapter 10.2.4 of the Therneau and Grambch book for a discussion of this (largely informed by the many mistakes I've myself made.)"

    However, I think you could use the approach outlined by @EdM in https://stats.stackexchange.com/questions/534516/survival-curve-from-time-dependent-coefficients-in-the-cox-model as a potential workaround (based on the 'log(time+20)' approach described in https://cran.r-project.org/web/packages/survival/vignettes/timedep.pdf).

    E.g. for your example data:

    library(survival)
    library(survminer)
    
    vfit <- coxph(Surv(time, status) ~ ., data = veteran)
    prop <- cox.zph(vfit)
    prop
    #>            chisq df       p
    #> trt       0.2644  1 0.60712
    #> celltype 15.2274  3 0.00163
    #> karno    12.9352  1 0.00032
    #> diagtime  0.0129  1 0.90961
    #> age       1.8288  1 0.17627
    #> prior     2.1656  1 0.14113
    #> GLOBAL   34.5525  8 3.2e-05
    plot(prop[3])
    abline(h = vfit$coefficients[5], lwd = 2, lty = 2, col = "firebrick3")
    

    zph plot

    
    vet2 <-survSplit(Surv(time, status) ~.,
                     data = veteran, cut = c(30, 60),
                     episode = "tgroup", id = "id")
    vfit2 <-coxph(Surv(tstart, time, status) ~ trt + prior + karno:strata(tgroup),
                  data = vet2)
    vfit2
    #> Call:
    #> coxph(formula = Surv(tstart, time, status) ~ trt + prior + karno:strata(tgroup), 
    #>     data = vet2)
    #> 
    #>                                   coef exp(coef)  se(coef)      z        p
    #> trt                          -0.010158  0.989894  0.190075 -0.053 0.957381
    #> prior                        -0.006735  0.993288  0.020252 -0.333 0.739479
    #> karno:strata(tgroup)tgroup=1 -0.052164  0.949173  0.008086 -6.451 1.11e-10
    #> karno:strata(tgroup)tgroup=2 -0.040127  0.960668  0.011541 -3.477 0.000507
    #> karno:strata(tgroup)tgroup=3 -0.008322  0.991712  0.008725 -0.954 0.340150
    #> 
    #> Likelihood ratio test=57.13  on 5 df, p=4.759e-11
    #> n= 305, number of events= 128
    prop2 <- cox.zph(vfit2)
    prop2
    #>                      chisq df     p
    #> trt                  1.217  1 0.270
    #> prior                3.898  1 0.048
    #> karno:strata(tgroup) 0.235  3 0.972
    #> GLOBAL               5.791  5 0.327
    plot(prop2[3])
    

    corrected zph plot

    
    plot(survfit(vfit2, newdata = vet2))
    

    surv plot

    
    ## tt() "log(time + 20)" approach
    dtimes <- sort(unique(veteran$time[veteran$status==1]))
    vet3 <-survSplit(Surv(time, status) ~.,
                     data = veteran, cut = dtimes,
                     episode = "tgroup", id = "id")
    vet3$time_var_karno <- vet3$karno * log(vet3$time+20)
    vfit4 <- coxph(Surv(tstart,time,status) ~ trt + prior + karno + time_var_karno, data = vet3)
    
    plotdata <- data.frame( 
      tstart = c(0, dtimes[1:96]),
      time = dtimes,
      trt = rep(0, 97),
      prior = rep(c(0, 5, 10), length.out = 97),
      karno = rep(30, 97),
      tgroup = rep(c(1,2,3), length.out = 97),
      status = rep(0, 97)
    )
    
    plotdata$time_var_karno <- plotdata$karno * log(plotdata$time + 20)
    plotdata2 <- plotdata
    plotdata2$karno <- 60
    plotdata2$time_var_karno <- plotdata2$karno * log(plotdata2$time + 20)
    plotdata3 <- plotdata
    plotdata3$karno <- 90
    plotdata3$time_var_karno <- plotdata3$karno * log(plotdata3$time + 20)
    
    plotdata$id <- "1"
    plotdata2$id <- "2"
    plotdata3$id <- "3"
    
    col_pal <- viridisLite::viridis(n = 4)
    plot(survfit(vfit4, newdata = rbind(plotdata,
                                        plotdata2,
                                        plotdata3),
                 id = id, se.fit = FALSE),
         bty = "n", xlab = "Days",
         ylab = "Survival probability",
         col = col_pal[1:3])
    text(160,0.15,"karno = 30", col = col_pal[1])
    text(200,0.4,"karno = 60", col = col_pal[2])
    text(240,0.6,"karno = 90", col = col_pal[3])
    

    corrected survplot

    
    # overlay
    plot(survfit(vfit2, newdata = vet2))
    lines(survfit(vfit4, newdata = rbind(plotdata,
                                         plotdata2,
                                         plotdata3),
                  id = id, se.fit = FALSE),
          bty = "n", xlab = "Days",
          ylab = "Survival probability",
          col = col_pal[1:3],
          lwd = 3)
    

    overlay

    Created on 2025-02-08 with reprex v2.1.1

    NB: I'm honestly not sure whether this answer is 'correct' or not. It looks good, i.e. the time_var_karno-corrected lines provide a very close approximation of the original ggsurvplot, but if you're publishing this in a journal I recommend asking a stats professor to critique it beforehand. I also think it would be worth asking for advice at https://stats.stackexchange.com and see if any of the 'heavy hitters' are willing to discuss further.