Search code examples
rdplyrcountrowspecial-characters

Filter and count rows with two conditions R


I am trying to filter out rows that consists of letters (no special characters like dots, brackets, etc.), which have from 2 to letters. I want to also count their frequencies.

The table

df = data.frame(stringsAsFactors = F,
                col1 = c(
                "[is]",
                "[is]",
                "[is]+[is]",
                "complex",
                "ta",
                "ta",
                "ta ta pa",
                "pum",
                "wam",
                "wam wam","puk","[is]+ pa", "[is]+ pa", "ta","wa pa [is] pa","wak wak")
                )

It looks like:

#         col1
#         
# 1       [is]
# 2       [is]
# 3  [is]+[is]
# 4    complex
# 5         ta
# 6         ta
# 7         ta
# 8   ta ta pa
# 9        pum
# 10       wam
# 11   wam wam

The output should look like this:

col1       N

ta         3
pum        1
wam        1


I used dplyr to grouped the col1 (there are more columns in my big table), filtered according to my conditions, and summarized the counts. The code below works like a charm and gives me what I want.

  group_by(col1) %>%
  filter(str_detect(col1, "^[[:alpha:]]{2,4}$")) %>%
  summarize(N = n())

I don't know though, how to include in a separate column also those counts, which appeared in other rows but were omitted. E.g. "ta" occurred twice in the row "ta ta pa". The target output is the one below:

col1       N    N_other

ta         3    2
pum        1    0
wam        1    2


Would appreciate a hint how to solve the problem. Hope my question is clear enough, if not please ask.


Solution

  • This is by no means pretty, but it might be helpful. This assumes you want 2-4 letter words extracted from col1, and then for those extracted words, to count up how many there are by themselves, as well as multiple in a single row/string.

    library(tidyverse)
    
    df %>%
      mutate(str_ex1 = str_extract(col1, "^[[:alpha:]]{2,4}$"),
             str_ex2 = str_extract(col1, paste(str_ex1, collapse = "|")),
             count1 = str_count(col1, paste(str_ex1, collapse = "|"))) %>%
      filter(count1 > 0) %>%
      group_by(str_ex1, str_ex2) %>%
      summarise(count2 = sum(count1)) %>%
      mutate(N_type = if_else(is.na(str_ex1), "N_other", "N")) %>%
      pivot_wider(id_cols = str_ex2, names_from = N_type, values_from = count2) %>%
      replace_na(list(N_other = 0)) 
    

    Output

    # A tibble: 3 x 3
      str_ex2     N N_other
      <chr>   <int>   <dbl>
    1 pum         1       0
    2 ta          3       2
    3 wam         1       2
    

    Edit (3/22/20)

    OP provided a subset of own data for testing (50 rows as suggested):

    structure(list(col1 = structure(c(2L, 3L, 57L, 4L, 47L, 47L, 
    39L, 9L, 18L, 34L, 34L, 18L, 56L, 14L, 23L, 42L, 53L, 9L, 9L, 
    17L, 45L, 1L, 30L, 30L, 18L, 52L, 9L, 2L, 2L, 2L, 2L, 2L, 2L, 
    3L, 3L, 3L, 3L, 7L, 2L, 34L, 47L, 34L, 24L, 24L, 22L, 25L, 25L, 
    34L, 34L, 34L), .Label = c("", "[is]", "[is] + [is]", "[is] + [is]+ [is]", 
    "[is] + pa", "[is] + pum tsa + [is]", "[is] pa", "[is] to go", 
    "c", "dza pam pa", "dza tsa", "gwa", "gwa pa", "gwam pa (and) tsa", 
    "gwam param pam gwam", "gwap pa", "katapum", "pa", "pa pa", "pa pa ta", 
    "pa pampa pa", "pak", "pam pa ta", "para", "puk", "pum pam pa", 
    "stn sta pa", "ta", "ta pa", "ta um pa", "tan tan pam pa", "tara", 
    "tok", "tsa", "tsa [close] da", "tsa pa", "tsa pa tsa", "tsa para pa", 
    "tsa ta", "tsak", "tsak pak", "tsan tsan tsa", "tsarara + [is]", 
    "um", "um pa", "um pa pa", "wa", "wa pa", "wa pa [is] pa", "wa sta um pa", 
    "wa wa", "wa wa pu pa pa pa", "wa wa ta pa", "wak wak", "wak wak pa pa", 
    "wam pa", "wam pam pa", "wam pam pa pa", "wam pam pam pa", "wap pa", 
    "wo", "wuk"), class = "factor"), col2 = structure(c(3L, 1L, 3L, 
    1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 
    1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("", 
    "touch + yes", "yes", "yes (for himself)"), class = "factor"), 
        col3 = structure(c(1L, 4L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 2L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "yes", 
        "yes (hand mov)", "yes (hands mov)", "yes (head mov)", "yes (index finger)"
        ), class = "factor"), col4 = structure(c(1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L), .Label = c("", "yes"), class = "factor"), col5 = structure(c(1L, 
        1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 4L, 1L, 2L, 4L, 2L, 4L, 4L, 
        2L, 1L, 4L, 1L, 1L, 4L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 
        2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 
        1L, 2L, 1L, 2L), .Label = c("", "yes", "yes (part of it)", 
        "yes\n"), class = "factor"), col6 = structure(c(1L, 1L, 1L, 
        1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        4L, 1L), .Label = c("", "repeats the mov", "yes", "yes (see next line)"
        ), class = "factor"), col7 = structure(c(1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
        2L), .Label = c("K", "M"), class = "factor")), row.names = c(NA, 
    50L), class = "data.frame")
    

    Code was revised as follows:

    df %>%
      mutate(col1 = as.character(col1)) %>%
      select(col1) %>%
      mutate(str_ex1 = str_extract(col1, "^[[:alpha:]]{2,4}$"),
             str_ex2 = str_extract(col1, paste(str_ex1, collapse = "|")),
             count1 = str_count(col1, paste(str_ex1, collapse = "|"))) %>%
      filter(count1 > 0) %>%
      group_by(str_ex1, str_ex2) %>%
      summarise(count2 = sum(count1)) %>%
      mutate(N_type = if_else(is.na(str_ex1), "N_other", "N")) %>%
      ungroup() %>%
      mutate(str_ex1 = coalesce(str_ex1, str_ex2)) %>%
      pivot_wider(id_cols = str_ex1, names_from = N_type, values_from = count2) %>%
      replace_na(list(N_other = 0))
    

    Output

    # A tibble: 6 x 3
      str_ex1     N N_other
      <chr>   <int>   <dbl>
    1 pa          3       6
    2 pak         1       0
    3 para        2       0
    4 puk         2       0
    5 tsa         7       4
    6 wa          3      16
    

    Also note, col1 is a factor, so this was made to a plain character vector. In addition, I included select(col1) to only look at col1 and ignore other columns.