Search code examples
r, dataframe, function, character

Delete rows in which the same term DOES NOT appear before and after the minus sign


In the data.frame `out` below, the first column contains two kinds of terms before and after the - sign:

(A) Terms in which the same word appears before and after the - sign (e.g., Baseline in rows 1 and 2)

(B) Terms in which NO word appears both before and after the - sign (e.g., rows 47 and 50)

Is there a way to create a function to delete rows of type (B) in the out data.frame?

library(emmeans)
# Read the data set from the hosted CSV file.
dd <- read.csv(
  "https://raw.githubusercontent.com/fpqq/w/main/1.csv"
)

# Linear model of gi on teaching_level, time and their interaction.
res1 <- lm(
  gi ~ teaching_level * time,
  data = dd
)

# Take the second element of the emmeans() result, convert it to a
# data.frame and drop every row that contains an NA.
out <- na.omit(
  data.frame(
    emmeans(res1, pairwise ~ teaching_level * time)[[2]]
  )
)
out
#                                              contrast    estimate        SE df     t.ratio     p.value
#1                 elementary Baseline - mixed Baseline  0.15185787 0.2895842 59  0.52439968 0.999994441
#2             elementary Baseline - secondary Baseline -0.10316420 0.2494777 59 -0.41352074 0.999999536
.
.
#47       (secondary Post-test 1) - (mixed Post-test 2) -1.03135871 0.5588269 59 -1.84557815 0.786224904
.
#50      (secondary Post-test 1) - (mixed Post-test 3) -0.78350792 0.5588269 59 -1.40205835 0.958572283
.
.

Solution

  • We can split the 'contrast' column in two at the - that is surrounded by spaces, extract the terms from each of the two resulting columns, and then keep only the rows whose two sides have at least one term in common

    library(dplyr)
    library(tidyr)
    library(stringr)
    library(tibble)
    library(purrr) # map() and map2() below come from purrr

    # Keep only the contrasts whose left and right sides share a term.
    out %>%
      # Preserve the original row numbers so kept rows can be traced back.
      rownames_to_column("rn") %>%
      as_tibble() %>%
      # Split at the hyphen that has whitespace on both sides; the hyphen inside
      # "Post-test" has none, so it is left intact. remove = FALSE keeps the
      # original 'contrast' column alongside the two halves.
      separate(contrast, into = c("pre", "post"), sep = "\\s+-\\s+",
               remove = FALSE) %>%
      # Turn each half into a list-column of trimmed terms. A term is a run of
      # letters/digits/hyphens optionally followed by whitespace and digits,
      # so "Post-test 1" is extracted as a single term.
      mutate(across(
        pre:post,
        ~ map(str_extract_all(., "[A-Za-z0-9-]+\\s*\\d*"), trimws)
      )) %>%
      # A row is kept when the two term sets have a non-empty intersection.
      filter(lengths(map2(pre, post, intersect)) > 0) %>%
      # Drop the helper columns again.
      select(-pre, -post)
    

    -output

    # A tibble: 17 × 7
       rn    contrast                                           estimate    SE    df t.ratio p.value
       <chr> <chr>                                                 <dbl> <dbl> <dbl>   <dbl>   <dbl>
     1 1     elementary Baseline - mixed Baseline                 0.152  0.290    59  0.524  1.00   
     2 2     elementary Baseline - secondary Baseline            -0.103  0.249    59 -0.414  1.00   
     3 3     elementary Baseline - (elementary Post-test 1)      -0.869  0.205    59 -4.23   0.00433
     4 12    mixed Baseline - secondary Baseline                 -0.255  0.306    59 -0.833  0.999  
     5 14    mixed Baseline - (mixed Post-test 1)                -0.533  0.299    59 -1.78   0.822  
     6 17    mixed Baseline - (mixed Post-test 2)                -1.61   0.588    59 -2.74   0.232  
     7 20    mixed Baseline - (mixed Post-test 3)                -1.36   0.588    59 -2.32   0.475  
     8 24    secondary Baseline - (secondary Post-test 1)        -0.326  0.245    59 -1.33   0.971  
     9 27    secondary Baseline - (secondary Post-test 2)        -0.344  0.363    59 -0.945  0.998  
    10 31    (elementary Post-test 1) - (mixed Post-test 1)       0.488  0.219    59  2.23   0.537  
    11 32    (elementary Post-test 1) - (secondary Post-test 1)   0.440  0.200    59  2.20   0.557  
    12 39    (mixed Post-test 1) - (secondary Post-test 1)       -0.0484 0.237    59 -0.204  1.00   
    13 41    (mixed Post-test 1) - (mixed Post-test 2)           -1.08   0.566    59 -1.91   0.750  
    14 44    (mixed Post-test 1) - (mixed Post-test 3)           -0.832  0.566    59 -1.47   0.943  
    15 48    (secondary Post-test 1) - (secondary Post-test 2)   -0.0174 0.347    59 -0.0503 1      
    16 57    (mixed Post-test 2) - (secondary Post-test 2)        1.01   0.620    59  1.64   0.889  
    17 59    (mixed Post-test 2) - (mixed Post-test 3)            0.248  0.759    59  0.326  1.00   
    

    NOTE: The output above was produced using the previous dataset in the OP's post


    With the new data

    # A tibble: 3 × 7
      rn    contrast                                       estimate    SE    df t.ratio p.value
      <chr> <chr>                                             <dbl> <dbl> <dbl>   <dbl>   <dbl>
    1 1     elementary Baseline - mixed Baseline              0.152 0.290    59   0.524 1.00   
    2 2     elementary Baseline - secondary Baseline         -0.103 0.249    59  -0.414 1.00   
    3 3     elementary Baseline - (elementary Post-test 1)   -0.869 0.205    59  -4.23  0.00433
    

    It can be wrapped in a function

    #' Keep only the contrasts whose two sides share at least one term
    #'
    #' The contrast label is split at the hyphen surrounded by whitespace, the
    #' terms on each side are extracted, and a row is retained only when the
    #' two sides have at least one extracted term in common.
    #'
    #' @param data A data frame containing the contrast column.
    #' @param contrast_col Unquoted name of the column holding contrast labels
    #'   of the form "<left> - <right>".
    #' @return A tibble with the retained rows of `data`; the temporary `pre`
    #'   and `post` helper columns are removed before returning.
    f1 <- function(data, contrast_col) {
      data %>%
        as_tibble() %>%
        # remove = FALSE keeps the original contrast column next to the halves.
        separate({{ contrast_col }}, into = c("pre", "post"),
                 sep = "\\s+-\\s+", remove = FALSE) %>%
        # purrr is called by namespace so the function does not depend on the
        # caller having attached purrr.
        mutate(across(
          pre:post,
          ~ purrr::map(str_extract_all(., "[A-Za-z0-9-]+\\s*\\d*"), trimws)
        )) %>%
        # Keep rows where the two term sets intersect.
        filter(lengths(purrr::map2(pre, post, intersect)) > 0) %>%
        select(-pre, -post)
    }
                
    # Apply f1 to `out`; the contrast column is passed as an unquoted name.
    f1(out, contrast)
    # A tibble: 3 × 6
      contrast                                       estimate    SE    df t.ratio p.value
      <chr>                                             <dbl> <dbl> <dbl>   <dbl>   <dbl>
    1 elementary Baseline - mixed Baseline              0.152 0.290    59   0.524 1.00   
    2 elementary Baseline - secondary Baseline         -0.103 0.249    59  -0.414 1.00   
    3 elementary Baseline - (elementary Post-test 1)   -0.869 0.205    59  -4.23  0.00433