Search code examples
rrecursionpartitioningctree

Bootstrapping and recursive partitioning analysis with ctree


I am currently struggling with recursiving partition and bagging/bootstrapping of some data. As the data is confidential I have provided a reproducible example using the "GBSG2" data. In essense I am currently trying to reproduce an article recently published in Journal of Clinical Oncology (https://ascopubs.org/doi/abs/10.1200/JCO.22.02222) with my own data on an identical patient population.

I have attached prints of their method section and a supplemental tabel which is essentially what I hope to end up withMethod1Method2Supp_table

My problem can be boiled down to

  • I want to for each end-node extract the three year survival rate and then specify for each patient which group they belong to - group A >70%, B; 70-50, C; 50-25 and D less than 25%.
  • When bootstrapping afterwards the same needs to happen so I can see for each iteration which group a specific patient was allocated to and how often this happened.

Here is a some dummy code and what I've done thus far

library(partykit)
data("GBSG2", package = "TH.data")

#Dataframe
df <- GBSG2

#Ctree object
stree <- ctree(Surv(time,cens)~., data=df, control= ctree_control(minsplit = 50, alpha = 0.1, multiway = T))

#The following part I hope could be done more efficiently
n <- predict(stree, type="node")
nd <- factor(predict(stree, type="node"))
df$node <- n
fit1 <- survfit(Surv(time,cens)~nd, data=df)
summary(fit1, times=365*3)

#Manual input to each node by reading the transcript
df$grp <- ifelse(df$node==3, "A",NA)
df$grp <- ifelse(df$node==4, "A", df$grp)
df$grp <- ifelse(df$node==7, "C", df$grp)
df$grp <- ifelse(df$node==8, "D", df$grp)
df$grp <- ifelse(df$node==9, "B", df$grp)

I believe the above needs to be fixed before my bootstrap can be done in order to get a result which matches the attached supplemental table (I'd like to do it 1000 times, but I'm doing 10 until it works).

    #Bagging
df_bag <- df %>% 
  select(-"node", -"grp")
cf <- cforest(Surv(time,cens)~.,data=df_bag, ntree=10, mtry = Inf)

Thank you very much,

Tobias Berg


Solution

  • I've managed to find solutions for my questions

    library(partykit)
    library(survival)
    data("GBSG2", package = "TH.data")
    
    #Data
    df <- GBSG2
    
    #Ctree object
    stree <- ctree(Surv(time,cens)~., data=df, control= ctree_control(minsplit = 50, alpha = 0.1, multiway = T))
    
    #Prediciton for Recursive partitioning analysis
            n <- predict(stree, type="node")
        node <- factor(predict(stree, type="node"))
        df$node <- n
        fit1 <- survfit(Surv(time,event)~node, data=df)
        res <- summary(fit1, times=365*3) 
        cols <- lapply(c(6, 10), function(x) res[x])
        tbl <- do.call(data.frame, cols)
        tbl$strata <- as.integer(gsub("[^0-9]", "", tbl$strata))
        tbl <- tbl %>% 
          rename(node=strata)
        df <- df %>% 
          left_join(., tbl, by="node") %>% 
          mutate(grp=ifelse(surv>0.699999, "A", NA)) %>% 
          mutate(grp=ifelse(surv<0.70 & surv>0.49999, "B", grp)) %>% 
          mutate(grp=ifelse(surv<0.50 & surv>0.24999, "C", grp)) %>% 
          mutate(grp=ifelse(surv<0.25, "D", grp))
        
    #Bootstrapping with 10 iterations
    #Function which essentially does the above prediction and returns for each row the corresponding group
    classify_abcd = function (df_bag_in, pred_vector) {
      n <- pred_vector
      node <- factor(pred_vector)
      df_bag_in$node <- n
      fit1 <- survfit(Surv(time,event)~node, data=df_bag_in)
      res <- summary(fit1, times=365*3,extend = TRUE) 
      cols <- lapply(c(6, 10), function(x) res[x])
      tbl <- do.call(data.frame, cols)
      tbl$strata <- as.integer(gsub("[^0-9]", "", tbl$strata))
      tbl <- tbl %>% 
        rename(node=strata)
      df_bag_in <- df_bag_in %>% 
        left_join(., tbl, by="node") %>% 
        mutate(grp=ifelse(surv>0.699999, "A", NA)) %>% 
        mutate(grp=ifelse(surv<0.70 & surv>0.49999, "B", grp)) %>% 
        mutate(grp=ifelse(surv<0.50 & surv>0.24999, "C", grp)) %>% 
        mutate(grp=ifelse(surv<0.25, "D", grp))
      
      return(df_bag_in[c('grp')])
    }
    #Bootstrapping 10 iterations. End result is the data frame with each group assignment per iteration
    cf <- cforest(Surv(time,event)~.,data=df, ntree=10, mtry = Inf, trace=T)
    all_list_runs <- predict(cf, type="node")
    map_id_to_classes = data.frame()
    for(pred_vector in all_list_runs) {
      per_id_class = classify_abcd(df_rpa, pred_vector) 
      print(per_id_class)
      
      if (length(map_id_to_classes) == 0) {
        map_id_to_classes = per_id_class
      } else {
        map_id_to_classes = cbind(map_id_to_classes, per_id_class$grp)
      }
      
    }