Search code examples
rregexdplyrstringracross

How do I elegantly str_detect across multiple columns and populating new columns conditionally


As you can see, I'm dealing with some serious dirty data. This code works, but looks a bit clunky. Is there a more efficient and dynamic way to achieve the final results without so much coding?

I had to do this in stages where first to flag the content type, and utilizing the content type to populate them into respective column types.

appreciate your help

#load library
library(dplyr)
library(stringr)
library(lubridate)

#create sample data
df <- tibble(c1 = c('9996155', '4001096', '4001525', '4000590','2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08'),
             c2 = c('4001902', '5000009', '2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08', '4000461', '4000311'),
             c3 = c('W-7', 'W-8', 'W-9', 'W-2', 'W-1', 'W-1','3.527E+20', '3.498E+20'),
             c4 = c('B09/20', 'B04/20', 'B05/20', 'B10/20', 'B06/20',  '3.408E+20', '3.229E+20', '3.225E+20')
             )

The data looks like this

> df
# A tibble: 8 x 4
  c1         c2         c3        c4       
  <chr>      <chr>      <chr>     <chr>    
1 9996155    4001902    W-7       B09/20   
2 4001096    5000009    W-8       B04/20   
3 4001525    2020-01-23 W-9       B05/20   
4 4000590    2019-12-23 W-2       B10/20   
5 2020-01-23 2020-01-20 W-1       B06/20   
6 2019-12-23 2019-12-08 W-1       3.408E+20
7 2020-01-20 4000461    3.527E+20 3.229E+20
8 2019-12-08 4000311    3.498E+20 3.225E+20

I have done something like this for getting it in shape


df %>%
  mutate(across(#flag them now to allow next step for data population
    starts_with('c'),
    ~ case_when(
      is.na(.) ~ NA_character_,
      str_detect(., regex('(^20[1,2][0-9]\\-)|(\\/20[1,2][0-9]$)')) ~ 'date',
      str_detect(., regex('\\d\\.\\d{3}[eE][+-]\\d{2}+')) ~ 'numericScientificNotation',
      str_detect(.,regex('(^[a-zA-Z][0-9]{2}\\/2[0-1]{1}$)|(^[A-Z]{1,2}\\-\\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)')) ~ 'batches',
      str_detect(., regex('^-?\\d+$')) ~ 'integers',
      TRUE ~ NA_character_
    ),
    .names = paste0('test', "_{col}")
  )) %>% #casewhen to populate new columns
  mutate(integer = case_when(test_c1 == 'integers' ~ c1,
                             test_c2 == 'integers' ~ c2,
                             test_c3 == 'integers' ~ c3,
                             test_c4 == 'integers' ~ c4),
         date = case_when(test_c1 == 'date' ~ c1,
                             test_c2 == 'date' ~ c2,
                             test_c3 == 'date' ~ c3,
                             test_c4 == 'date' ~ c4),
         batches = case_when(test_c1 == 'batches' ~ c1,
                               test_c2 == 'batches' ~ c2,
                               test_c3 == 'batches' ~ c3,
                               test_c4 == 'batches' ~ c4),
         numericScientificNotation = case_when(test_c1 == 'numericScientificNotation' ~ c1,
                               test_c2 == 'numericScientificNotation' ~ c2,
                               test_c3 == 'numericScientificNotation' ~ c3,
                               test_c4 == 'numericScientificNotation' ~ c4)
         ) %>% 
  select(9:12) #this is all that i need

Require only this organized output.

Thanks!

# A tibble: 8 x 4
  integer date       batches numericScientificNotation
  <chr>   <chr>      <chr>   <chr>                    
1 9996155 NA         W-7     NA                       
2 4001096 NA         W-8     NA                       
3 4001525 2020-01-23 W-9     NA                       
4 4000590 2019-12-23 W-2     NA                       
5 NA      2020-01-23 W-1     NA                       
6 NA      2019-12-23 W-1     3.408E+20                
7 4000461 2020-01-20 NA      3.527E+20                
8 4000311 2019-12-08 NA      3.498E+20                

Solution

  • Here's a way to simplify this and reduce repetition :

    library(dplyr)
    
    regex_list <- list(date = '(^20[1,2][0-9]\\-)|(\\/20[1,2][0-9]$)', 
                      numericScientificNotation = '\\d\\.\\d{3}[eE][+-]\\d{2}+', 
                      batches = '(^[a-zA-Z][0-9]{2}\\/2[0-1]{1}$)|(^[A-Z]{1,2}\\-\\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)', 
                      integers = '^-?\\d+$')
    
    
    purrr::imap_dfc(regex_list, function(x, y) 
                      df %>%
                        mutate(across(.fns = ~ifelse(str_detect(.x, x), .x, NA))) %>%
                        transmute(!!y := do.call(coalesce, .)))
    
    #  date       numericScientificNotation batches integers
    #  <chr>      <chr>                     <chr>   <chr>   
    #1 NA         NA                        W-7     9996155 
    #2 NA         NA                        W-8     4001096 
    #3 2020-01-23 NA                        W-9     4001525 
    #4 2019-12-23 NA                        W-2     4000590 
    #5 2020-01-23 NA                        W-1     NA      
    #6 2019-12-23 3.408E+20                 W-1     NA      
    #7 2020-01-20 3.527E+20                 NA      4000461 
    #8 2019-12-08 3.498E+20                 NA      4000311