Search code examples
rdatabricksmutate

R in Databricks - mutate gives error when using case_when


Using R scripting in Databricks, I want to change the "numeric" to a new "~ numeric" and create a new variable called dc_17$strength_conc_per_uom.

Can anyone give an insight as to why this mutate code only gives errors when using case_when.

dc_17$strength_extract <- as.character(dc_17$strength_extract)
dc_17$strength_conc_per_uom <- dc_17 %>% mutate(strength_extract = case_when(
                          str_detect(strength_extract, "1") ~ 1,
                          str_detect(strength_extract, "8") ~ 8, 
                          str_detect(strength_extract, "20") ~ 20,
                          str_detect(strength_extract, "30") ~ 30,
                          str_detect(strength_extract, "55") ~ 1,
                          str_detect(strength_extract, "105") ~ 2,
                          str_detect(strength_extract, "155") ~ 3,
                          str_detect(strength_extract, "255") ~ 5,
                          str_detect(strength_extract, "505") ~ 10,
                          str_detect(strength_extract, "1005") ~ 20,
                          str_detect(strength_extract, "3030") ~ 1,
                          str_detect(strength_extract, "5050") ~ 1,
                          str_detect(strength_extract, "6060") ~ 1,
                     TRUE ~ NA_character_))

I continue to get an error for this

Error in mutate(., strength_extract = case_when(str_detect(strength_extract,  : 
  Problem while computing `strength_extract = case_when(...)`.
Caused by error in `case_when()`:
Error in `mutate()`:
Error in `mutate()`:
! Problem while computing `strength_extract = case_when(...)`.
Caused by error in `case_when()`:

also tried the following to resolve the issue, however they also gives errors.

dc_17$strength_conc_per_uom <- NA_integer_
dc_17$strength_extract <- as.character(dc_17$strength_extract)

dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "1")] <- 1
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "8")] <- 8
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "20")] <- 20
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "30")] <- 30
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "55")] <- 1
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "105")] <- 2
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "155")] <- 3
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "255")] <- 5
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "505")] <- 10
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "1005")] <- 20
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "3030")] <- 1
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "5050")] <- 1
dc_17$strength_conc_per_uom[str_detect(dc_17$strength_extract, "6060")] <- 1

This approach returns only NAs

and

dc_17$strength_extract <- as.character(dc_17$strength_extract)

dc_17$strength_conc_per_uom <- ifelse(dc_17$strength_extract == "1", 1,
ifelse(dc_17$strength_extract == "8", 8,
ifelse(dc_17$strength_extract == "20", 20,
ifelse(dc_17$strength_extract == "30", 30,
ifelse(dc_17$strength_extract == "55", 1,
ifelse(dc_17$strength_extract == "105", 2,
ifelse(dc_17$strength_extract == "155", 3,
ifelse(dc_17$strength_extract == "255", 5,
ifelse(dc_17$strength_extract == "505", 10,
ifelse(dc_17$strength_extract == "1005", 20,
ifelse(dc_17$strength_extract == "3030", 1,
ifelse(dc_17$strength_extract == "5050", 1,
ifelse(dc_17$strength_extract == "6060", 1, NA)))))))))))))

this approach drops values 2, 3, 5 and 10 in the output


Solution

  • You can easily avoid such tedious ifelse excesses.

    Consider this data frame.

    df
    #    strength          X
    # 1       255 0.20514564
    # 2        20 0.50183656
    # 3         1 0.15259178
    # 4       155 0.15909367
    # 5      1005 0.68420944
    # 6       505 0.80878910
    # 7        55 0.57800968
    # 8      6060 0.88166418
    # 9       105 0.08513027
    # 10        8 0.36283751
    # 11     5050 0.91330926
    # 12     3030 0.65183322
    # 13       30 0.02780792
    

    Create a substitution vector,

    u <- c(`1`=1, `8`=8, `20`=20, `30`=30, `55`=1, `105`=2, `155`=3, `255`=5, `505`=10, 
           `1005`=20, `3030`=1, `5050`=1, `6060`=1)
    

    and match it with the desired column like so:

    df$strength <- u[match(df$strength, names(u))]
    

    Gives

    df
    #    strength          X
    # 1         5 0.20514564
    # 2        20 0.50183656
    # 3         1 0.15259178
    # 4         3 0.15909367
    # 5        20 0.68420944
    # 6        10 0.80878910
    # 7         1 0.57800968
    # 8         1 0.88166418
    # 9         2 0.08513027
    # 10        8 0.36283751
    # 11        1 0.91330926
    # 12        1 0.65183322
    # 13       30 0.02780792
    

    Data:

    df <- structure(list(strength = c("3030", "8", "1", "255", "105", "505", 
    "30", "1005", "6060", "5050", "20", "155", "55"), X = c(0.205145640065894, 
    0.501836559036747, 0.15259177563712, 0.159093674505129, 0.684209441067651, 
    0.808789104921743, 0.578009682707489, 0.881664179963991, 0.0851302673108876, 
    0.362837513675913, 0.913309262134135, 0.651833220385015, 0.0278079155832529
    )), class = "data.frame", row.names = c(NA, -13L))