Search code examples
rregexstringrstringi

edit string text in dataframe variable


I want to tidy up a dataframe and automate the process. Given the following data.frame:

library(survival)
library(rms)
library(broom)
library(tidyverse)
res.cox <- coxph(Surv(time, status) ~ rcs(age, 3) + sex + ph.ecog +
                   rcs(meal.cal, 4), data = lung)
output <- tidy(res.cox)
output
#   term                        estimate std.error statistic p.value
#   <chr>                          <dbl>     <dbl>     <dbl>   <dbl>
# 1 rcs(age, 3)age             -0.00306    0.0219     -0.140 0.889  
# 2 rcs(age, 3)age'             0.0154     0.0261      0.592 0.554  
# 3 sex                        -0.525      0.192      -2.74  0.00620
# 4 ph.ecog                     0.421      0.131       3.22  0.00128
# 5 rcs(meal.cal, 4)meal.cal   -0.000416   0.00104    -0.400 0.689  
# 6 rcs(meal.cal, 4)meal.cal'   0.00118    0.00232     0.509 0.611  
# 7 rcs(meal.cal, 4)meal.cal'' -0.00659    0.0114     -0.577 0.564  

I want to remove the rcs-spline information from term variable and be left with:

#   term         estimate std.error statistic p.value
#   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
# 1 s1 age      -0.00306    0.0219     -0.140 0.889  
# 2 s2 age       0.0154     0.0261      0.592 0.554  
# 3 sex         -0.525      0.192      -2.74  0.00620
# 4 ph.ecog      0.421      0.131       3.22  0.00128
# 5 s1 meal.cal -0.000416   0.00104    -0.400 0.689  
# 6 s2 meal.cal  0.00118    0.00232     0.509 0.611  
# 7 s3 meal.cal -0.00659    0.0114     -0.577 0.564  

I want the solution to easily work for other cases too so when you increase the number of knots:

res.cox2 <- coxph(Surv(time, status) ~ rcs(age, 4) + rcs(meal.cal, 6) +
                   sex + ph.ecog, data = lung)
output2 <- tidy(res.cox2)
output2
#    term                          estimate std.error statistic  p.value
#    <chr>                            <dbl>     <dbl>     <dbl>    <dbl>
#  1 rcs(age, 4)age                0.0419     0.0403      1.04  0.298   
#  2 rcs(age, 4)age'              -0.101      0.0806     -1.26  0.208   
#  3 rcs(age, 4)age''              0.569      0.388       1.47  0.142   
#  4 rcs(meal.cal, 6)meal.cal     -0.000974   0.00155    -0.631 0.528   
#  5 rcs(meal.cal, 6)meal.cal'     0.00751    0.0115      0.655 0.512   
#  6 rcs(meal.cal, 6)meal.cal''   -0.0217     0.0358     -0.607 0.544   
#  7 rcs(meal.cal, 6)meal.cal'''   0.0614     0.123       0.501 0.616   
#  8 rcs(meal.cal, 6)meal.cal'''' -0.0775     0.163      -0.475 0.634   
#  9 sex                          -0.552      0.195      -2.83  0.00465 
# 10 ph.ecog                       0.440      0.132       3.34  0.000835

you would be left with:

#    term         estimate std.error statistic  p.value
#    <chr>           <dbl>     <dbl>     <dbl>    <dbl>
#  1 s1 age       0.0419     0.0403      1.04  0.298   
#  2 s2 age      -0.101      0.0806     -1.26  0.208   
#  3 s3 age       0.569      0.388       1.47  0.142   
#  4 s1 meal.cal -0.000974   0.00155    -0.631 0.528   
#  5 s2 meal.cal  0.00751    0.0115      0.655 0.512   
#  6 s3 meal.cal -0.0217     0.0358     -0.607 0.544   
#  7 s4 meal.cal  0.0614     0.123       0.501 0.616   
#  8 s5 meal.cal -0.0775     0.163      -0.475 0.634   
#  9 sex         -0.552      0.195      -2.83  0.00465 
# 10 ph.ecog      0.440      0.132       3.34  0.000835

etc...

My attempt so far gets me some of the way but I am not sure of the best way to deal with the ', '' (note the first term does not contain a ') etc.:

output %>% 
  mutate(rcs_indicator = str_detect(term, fixed("rcs(")),
         term = str_replace_all(term, "rcs\\(.+?\\)", ""))
#   term        estimate std.error statistic p.value rcs_indicator
#   <chr>          <dbl>     <dbl>     <dbl>   <dbl> <lgl>        
# 1 age        -0.00306    0.0219     -0.140 0.889   TRUE         
# 2 age'        0.0154     0.0261      0.592 0.554   TRUE         
# 3 sex        -0.525      0.192      -2.74  0.00620 FALSE        
# 4 ph.ecog     0.421      0.131       3.22  0.00128 FALSE        
# 5 meal.cal   -0.000416   0.00104    -0.400 0.689   TRUE         
# 6 meal.cal'   0.00118    0.00232     0.509 0.611   TRUE         
# 7 meal.cal'' -0.00659    0.0114     -0.577 0.564   TRUE 

It might be useful to just work with the terms I need to change directly:

unique(str_subset(output$term, fixed("rcs(")) %>% 
  str_replace_all("'", ""))
# [1] "rcs(age, 3)age"           "rcs(meal.cal, 4)meal.cal"

I feel there is a way to do this in a simpler way than the steps I am doing.

Any suggestions?

Thanks


Solution

  • This is also less elegant than desired, but should work for multiple knots

    output %>%
      mutate(is_spline = grepl("^rcs\\(.*?, \\d\\)", term),
             n_term = str_count(term, "'") + 1,
             pre = ifelse(is_spline, paste0('s', n_term, ' '), ""),
             term = paste0(pre, gsub("(^rcs\\(.*?, \\d\\))|(\\'+$)", "", term))) %>%
      select(-is_spline, -n_term, -pre)
    #> # A tibble: 7 x 5
    #>   term         estimate std.error statistic p.value
    #>   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
    #> 1 s1 age      -0.00306    0.0219     -0.140 0.889  
    #> 2 s2 age       0.0154     0.0261      0.592 0.554  
    #> 3 sex         -0.525      0.192      -2.74  0.00620
    #> 4 ph.ecog      0.421      0.131       3.22  0.00128
    #> 5 s1 meal.cal -0.000416   0.00104    -0.400 0.689  
    #> 6 s2 meal.cal  0.00118    0.00232     0.509 0.611  
    #> 7 s3 meal.cal -0.00659    0.0114     -0.577 0.564