Search code examples
rstringloopssubstring

How to loop through variables and assign a value of 1 based on multiple substr values at given character locations?


I'm moving from SAS to R, and I'm trying to convert the following SAS code to R:

data Drug_inj;
set initial;
array odiag(25) odiag1-odiag25;
do i = 1 to 25;
if substrn(odiag(i), 1,3) = 'T36' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,3) = 'T37' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,3) = 'T38' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,3) = 'T39' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,2) = 'T4'  and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,3) = 'T50' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,4) = 'T369' == 1 then total_drug = 1;
end;

Just like the SAS code, I want the R code to look at the variables "odiag1" through "odiag25" in turn and give the variable "total_drug" a value of "1" on each row where the conditions are met. Specifically, the conditions are met when the first 3 characters of the string are "T36" through "T50" and the 6th character is equal to "1".

So far, I've figured that one of the easiest ways to do a loop in R is by creating a list:

my_list <- list("odiag1" "odiag2" "odiag3" "odiag4" etc. "odiag25")

for i in length(my_list))
{

Drug_inj$total_drug = ifelse(substr(Drug_inj$i, 1,3 == "T36") & substr(Drug_inj$i, 6,1 == "1")) == 1
                      ifelse(substr(Drug_inj$i, 1,3 == "T37") & substr(Drug_inj$i, 6,1 == "1")) == 1
                      ifelse(substr(Drug_inj$i, 1,3 == "T38") & substr(Drug_inj$i, 6,1 == "1")) == 1
                      ifelse(substr(Drug_inj$i, 1,3 == "T39") & substr(Drug_inj$i, 6,1 == "1")) == 1
                      ifelse(substr(Drug_inj$i, 1,2 == "T4") & substr(Drug_inj$i, 6,1 == "1")) == 1
                      ifelse(substr(Drug_inj$i, 1,3 == "T50") & substr(Drug_inj$i, 6,1 == "1")) == 1
                      ifelse(substr(Drug_inj$i, 1,4 == "T369")) == 1}

I'm getting this error " Error in substr(pdd_master$i, 1, 3 == "T36") & substr(pdd_master$i, 6, : operations are possible only for numeric, logical or complex types "

The code I'm using is modified from some code a coworker gave me. Unfortunately, their code only needs to check the first 3 or 4 characters, so they used the following:

pdd_master<- pdd_master %>% 
  unite(all_causes, odiag1, odiag2, odiag3, odiag4, odiag5, odiag6, odiag7,
        odiag8, odiag9, odiag10, odiag11, odiag12, odiag13, odiag14, odiag15, odiag16,
        odiag17, odiag18, odiag19, odiag20, odiag21, odiag22, odiag23, odiag24, 
        odiag25 sep = " ", remove = FALSE)

pdd_master$total_drug_unint = ifelse(str_detect(pdd_master$all_causes, "T36")==T,1,                                         
                              ifelse(str_detect(pdd_master$all_causes, "T37")==T,1,
                              ifelse(str_detect(pdd_master$all_causes, "T38")==T,1,
                              ifelse(str_detect(pdd_master$all_causes, "T39")==T,1,
                              ifelse(str_detect(pdd_master$all_causes, "T4")==T,1,
                              ifelse(str_detect(pdd_master$all_causes, "T50")==T,1,
                              ifelse(str_detect(pdd_master$all_causes, "T3691")==T,1,0)))))))))

Since their version only looks at the first 2 or 3 characters and never needs to check the 6th character as well, I figured I needed to do a more traditional loop.


Solution

  • Using dplyr::if_any(), you can apply your test across multiple columns using tidyselect syntax. You can also collapse all your tests into one by testing if the beginning substring is in the range paste0("T", 36:50):

    library(dplyr)

    # Flag each row with total_drug = 1L when ANY odiag column holds a matching
    # code, and 0L otherwise. A code matches when either
    #   * its first 3 characters are one of "T36" ... "T50" and its 6th
    #     character is "1", or
    #   * its first 4 characters are "T369".
    # The result is printed, not assigned; assign it to keep the new column.
    Drug_inj %>%
      mutate(total_drug = as.integer(
        if_any(
          # Select every odiag<number> column that is present, so the same
          # code covers odiag1-odiag5 in the example data and odiag1-odiag25
          # in the full data set described in the question.
          .cols = matches("^odiag[0-9]+$"),
          # %in% yields FALSE (never NA) for a missing code, so rows with NA
          # diagnoses get 0 instead of NA. The 6th character is compared with
          # the string "1" rather than relying on coercion of the number 1.
          .fns = ~ (substr(.x, 1, 3) %in% paste0("T", 36:50) &
            substr(.x, 6, 6) %in% "1") |
            substr(.x, 1, 4) %in% "T369"
        )
      ))
    
       id odiag1 odiag2 odiag3 odiag4 odiag5 total_drug
    1   1 T69880 T48900 T15200 T19781 T96201          0
    2   2 T17160 T57341 T77861 T11291 T54481          0
    3   3 T58691 T23971 T98041 T70501 T44780          0
    4   4 T19430 T69631 T86840 T94860 T21231          0
    5   5 T90850 T73650 T59201 T27471 T24791          0
    6   6 T36911 T57890 T20900 T33501 T78321          1
    7   7 T94121 T43891 T54210 T83670 T73520          1
    8   8 T53430 T93100 T71920 T40301 T29870          1
    9   9 T71301 T75980 T83571 T66510 T73021          0
    10 10 T85040 T42281 T31631 T82660 T98990          1
    11 11 T80390 T66010 T91921 T61350 T68470          0
    12 12 T69930 T24641 T91030 T82221 T43860          0
    13 13 T85660 T39360 T54991 T28981 T64351          0
    14 14 T99820 T88390 T88320 T65480 T17440          0
    15 15 T40760 T36190 T44520 T27561 T99881          0
    16 16 T28401 T69920 T97600 T75070 T42180          0
    17 17 T66851 T55650 T28491 T45501 T97011          1
    18 18 T88631 T27251 T37961 T67121 T57060          1
    19 19 T30791 T57310 T88331 T79461 T37131          1
    20 20 T62440 T81541 T65160 T68280 T41260          0
    

    Example data:

    # Reproducible toy data: 20 rows with an id column and five diagnosis
    # columns named odiag1 ... odiag5.
    set.seed(13)

    n_rows <- 20
    Drug_inj <- data.frame(id = seq_len(n_rows))

    # Each code is "T" + a 4-digit number (drawn without replacement within a
    # column) + a trailing 0/1 flag. The number draw precedes the flag draw,
    # which fixes how the seeded random stream is consumed.
    for (col_name in paste0("odiag", 1:5)) {
      four_digits <- sample(1000:9999, n_rows)
      flag <- sample(0:1, n_rows, replace = TRUE)
      Drug_inj[[col_name]] <- paste0("T", four_digits, flag)
    }