Search code examples
rdplyr

Looping through multiple columns to apply conditional transformations


I am trying to create the following loop:

Loop through columns_to_check (10 columns), if there is no value (i.e. NA) then NA. End loop. If not NA, then check the following conditions:

  • If value is 2 or 4, then corresponding new column value should be 0.
  • If it is the first occurrence of 1,3 or 5 in the row then corresponding new value should be 1.
  • If it isn't the first occurrence of 1,3 or 5 in the row then corresponding new value should be 0.
  • Else 999.

Create 10 new columns with the result of the loop and label them stp_result_ (value 1 to 10.

Here's my dataframe:

df <- structure (list(
  subject_id = c("5467", "6784", "3457", "0987", "1245", "1945","3468", "0012","0823","0812"), 
  stp_t_1 = c(1,3,5,1,2,5,4,3,3,1),
  stp_t1_cor = c(0,0,0,0,0,0,0,0,0,0), 
  stp_t1_cor_num = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  stp_t_2 = c(2,5,1,3,5,1,3,2,2,3), 
  stp_t2_cor = c(1,0,0,0,0,0,0,0,0,0), 
  stp_t2_cor_num = c(1,NA,NA,NA,NA,NA,NA,NA,NA,NA), 
  stp_t_3 = c(3,2,5,4,3,3,3,3,1,5),
  stp_t3_cor = c(0,1,0,0,0,0,0,0,0,0),
  stp_t3_cor_num = c(NA,4,NA,NA,NA,NA,NA,NA,NA),
  stp_t_4 = c(4,1,4,3,NA,NA,1,2,5,NA),
  stp_t4_cor = c(1,0,0,0,NA,NA,0,0,0,0),
  stp_t4_cor_num = c(1,NA,NA,NA,NA,NA,NA,NA,NA),
  stp_t_5 = c(5,NA,3,1,NA,NA,1,3,NA,NA),
  stp_t5_cor = c(0,NA,0,0,NA,NA,0,0,NA,NA),
  stp_t5_cor_num = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  stp_t_6 = c(NA,NA,NA,NA,NA,NA,4,4,NA,NA),
  stp_t6_cor = c(NA,NA,NA,NA,NA,NA,0,0,NA,NA), 
  stp_t6_cor_num = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  stp_t_7 = c(NA,NA,NA,NA,NA,NA,5,5,NA,NA), 
  stp_t7_cor = c(NA,NA,NA,NA,NA,NA,0,0,NA,NA), 
  stp_t7_cor_num = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA), 
  stp_t_8 = c(NA,NA,NA,NA,NA,NA,2,1,NA,NA),
  stp_t8_cor = c(NA,NA,NA,NA,NA,NA,0,0,NA,NA),
  stp_t8_cor_num = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  stp_t_9 = c(NA,NA,NA,NA,NA,NA,1,3,NA,NA),
  stp_t9_cor = c(NA,NA,NA,NA,NA,NA,0,0,NA,NA),
  stp_t9_cor_num = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA),
  stp_t_10 = c(NA,NA,NA,NA,NA,NA,1,2,NA,NA),
  stp_t10_cor = c(NA,NA,NA,NA,NA,NA,0,0,NA,NA),
  stp_t10_cor_num = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)), 
  class = "data.frame", row.names = c(NA, -10L))

Here's what I have tried:

columns_to_check <- grep("stp_t_", names(df), value = TRUE)

for (i in 1:length(columns_to_check)) {
  col_name <- columns_to_check[i]
  new_col_name <- paste0("stp_result_", i)
  
  result <- rep(NA, nrow(df))
  
  for (j in 1:nrow(df)) {
    value <- df[[col_name]][j]
    
    if (is.na(value)) {
      result[j] <- NA 
    } else if (value %in% c(2, 4)) {
      result[j] <- 0   
    } else if (value %in% c(1, 3, 5)) {
      if (value %in% c(1, 3, 5) && !any(!is.na(result[1:j - 1]) & result[1:j - 1] == 1)) {
        result[j] <- 1   
      } else {
        result[j] <- 0   
      }
    } else {
      result[j] <- 999 
    }
  }
  df[[new_col_name]] <- result
}

I think the entire loop just doesn't work. It results in a lot of 0's and incorrectly states others as 1. Not sure where I am going wrong.

Desired outcome would be the original df with the new columns:

subject_id  stp_result_1  stp_result_2  stp_result_3  stp_result_4  stp_result_5  stp_result_6  stp_result_7  stp_result_8  stp_result_9  stp_result_10
5467        1             0             1             0             1             NA            NA            NA            NA            NA
6784        1             1             0             1             NA            NA            NA            NA            NA            NA
3457        1             1             0             0             1             NA            NA            NA            NA            NA
0987        1             1             0             0             0             NA            NA            NA            NA            NA
1245        1             1             1             NA            NA            NA            NA            NA            NA            NA
1945        1             1             1             NA            NA            NA            NA            NA            NA            NA
3468        0             1             0             1             0             0             1             0             0             0
0012        1             0             0             0             0             0             1             1             0             0
0823        1             0             1             1             NA            NA            NA            NA            NA            NA
0812        1             1             1             NA            NA            NA            NA            NA            NA            NA

Solution

  • Here is a function that checks and applies the conditions without loops, for or other. The the code sapply's the function to each of the columns to check and returns a matrix that can be cbind'ed with the original data.frame.

    fun <- function(x) {
      # default value
      y <- rep(999, length(x))
      # assign NA to y if x is NA
      is.na(y) <- is.na(x)
      # assign 0 to all y where x is 2 or 4
      y[x %in% c(2, 4)] <- 0
      # now check values 1, 3, 5
      for(i in c(1, 3, 5)) {
        if(any(x == i, na.rm = TRUE)) {
          w <- which(x == i)
          # y gets a 1 for the first such values 
          y[ w[1L] ] <- 1
          # and a 0 if more exist
          if(length(w) > 1L) y[ w[-1L] ] <- 0
        }
      }
      y
    }
    
    columns_to_check <- grep("stp_t_", names(df), value = TRUE)
    
    res <- apply(df[columns_to_check], 1, fun) |> t()
    colnames(res) <- paste0("stp_result_", seq_along(columns_to_check))
    res
    
    cbind(df, res)
    

    Created on 2024-05-03 with reprex v2.1.0


    Edit

    See if this works:

    apply(df[columns_to_check], 1, fun) |> t()