Search code examples
rdataframecountuniqueunique-values

From two data frames, is there any way to count unique values of a column in R?


From a tsv file, I have two tables.

For example,

table 1: V_one

readId V_segment
abc IGHV3-18-F(2695)
ghi IGHV3-9-F(2840)

and

table 2: V_two

readId V_segment
aaa IGHV3-9-F(2714),IGHV3-12-F(2656)
bbb IGHV3-10-F(2666),IGHV3-18-F(2666)

And I need to create another table where, for each unique value in the column "V_segment" of these new two tables, I get a count of the times every unique value appears in the column "V_segment" in the tsv file.

For example:

V_segment Count
IGHV3-18-F 3
IGHV3-9-F 15
IGHV3-9-F,IGHV3-12-F 4
IGHV3-10-F,IGHV3-18-F 7

I tried this code:

df <- read_tsv('file.tsv') # tsv file with lots of columns
segmV <- data.frame(df$readId, df$V_segment, stringsAsFactors = FALSE) # create a data frame with two columns of the tsv file
V_one <- segmV[!grepl(",", df$V_segment),] # table 1
V_two <- segmV[grepl(",", df$V_segment),] # table 2

v1 <- V_one %>%
      group_by(V_one$V_segment) %>%
      mutate(Count = n_distinct(V_one$V_segment))

But it's not working, since (for this) I need to ignore the information in the parenthesis.

> dput(head(V_one, n=10))
structure(list(readId = c("abc", "def", 
"ghi", "jkl", "mno", "pqr", "stu", "vwy", "zab", "cde"), 
    V_segment = c("IGHV3-18-F(2695)", "IGHV3-18-F(2782)", 
    "IGHV3-18-F(2772)", "IGHV3-18-F(1952)", "IGHV3-15-F(2792)", 
    "IGHV3-18-F(2558)", "IGHV3-9-F(2831)", "IGHV3-18-F(2221)", 
    "IGHV3-18-F(2812)", "IGHV3-15-F(2791)")), row.names = c(1L, 
2L, 4L, 9L, 11L, 14L, 17L, 19L, 24L, 25L), class = "data.frame")

> dput(head(V_two, n=10))
structure(list(readId = c("aaa", "bbb", "ccc", "ddd", "eee", "fff", 
"ggg", "hhh", "iii", "jjj"), 
    V_segment = c("IGHV3-10-F(2429),IGHV3-12-F(2429)", 
    "IGHV3-9-F(2714),IGHV3-12-F(2656)", "IGHV1-16-ORF(2920),IGHV1-19-F(2920)", 
    "IGHV3-17-F(2512),IGHV3-18-F(2512)", "IGHV3-10-F(2666),IGHV3-18-F(2666)", 
    "IGHV1-8-F(2901),IGHV1-21-F(2814)", "IGHV3-10-F(2685),IGHV3-18-F(2685)", 
    "IGHV3-12-F(2801),IGHV3-20-F(2743)", "IGHV3-20-F(2754),IGHV3-10-F(2725)", 
    "IGHV3-10-F(2714),IGHV3-18-F(2714)")), row.names = c(3L, 
5L, 6L, 7L, 8L, 10L, 12L, 13L, 15L, 16L), class = "data.frame")

> dput(head(V_table3, n=10))
structure(list(readId = c("aaa", "bbb", "ccc", "ddd", "eee", "fff", 
 "ggg", "hhh", "iii", "jjj"), 
        V_segment = c("IGHV3-10-F(242,9),IGHV3-12-F(242,9)", 
        "IGHV3-9-F(271,4),IGHV3-12-F(265,6)", "IGHV1-16-ORF(292,0),IGHV1-19-F(2920)", 
        "IGHV3-17-F(251,2),IGHV3-18-F(2512)", "IGHV3-10-F(2666),IGHV3-18-F(2666)", 
        "IGHV1-8-F(290,1),IGHV1-21-F(281,4)", "IGHV3-10-F(2685),IGHV3-18-F(2685)", 
        "IGHV3-12-F(2801),IGHV3-20-F(274,3)", "IGHV3-20-F(2754),IGHV3-10-F(272,5)", 
        "IGHV3-10-F(2714),IGHV3-18-F(271,4)")), row.names = c(3L, 
    5L, 6L, 7L, 8L, 10L, 12L, 13L, 15L, 16L), class = "data.frame")

Could you help me please?

Thank you in advance!


Solution

  • library(dplyr)
    
    bind_rows(mget(ls(pattern = "V_"))) %>%  
      count(V_segment)
    
    # A tibble: 4 × 2
      V_segment                             n
      <chr>                             <int>
    1 IGHV3-10-F(2666),IGHV3-18-F(2666)     1
    2 IGHV3-18-F(2695)                      1
    3 IGHV3-9-F(2714),IGHV3-12-F(2656)      1
    4 IGHV3-9-F(2840)                       1
    

    Individual segments:

    bind_rows(mget(ls(pattern = "V_"))) %>% 
      mutate(across(V_segment, ~ str_remove_all(.x, "\\(.*?\\)"))) %>%  
      count(V_segment)
    
    # A tibble: 4 × 2
      V_segment                 n
      <chr>                 <int>
    1 IGHV3-10-F,IGHV3-18-F     1
    2 IGHV3-18-F                1
    3 IGHV3-9-F                 1
    4 IGHV3-9-F,IGHV3-12-F      1
    

    With the provided sample data:

    V_one <- structure(list(readId = c("abc", "def", 
                              "ghi", "jkl", "mno", "pqr", "stu", "vwy", "zab", "cde"), 
                   V_Segment = c("IGHV3-18-F(2695)", "IGHV3-18-F(2782)", 
                                 "IGHV3-18-F(2772)", "IGHV3-18-F(1952)", "IGHV3-15-F(2792)", 
                                 "IGHV3-18-F(2558)", "IGHV3-9-F(2831)", "IGHV3-18-F(2221)", 
                                 "IGHV3-18-F(2812)", "IGHV3-15-F(2791)")), row.names = c(1L, 
                                                                                         2L, 4L, 9L, 11L, 14L, 17L, 19L, 24L, 25L), class = "data.frame") %>% 
      as_tibble()
    
    V_one %>%  
      mutate(V_Segment = str_remove(V_Segment, "\\(.*?\\)")) %>%
      count(V_Segment)
    
    # A tibble: 3 × 2
      V_Segment      n
      <chr>      <int>
    1 IGHV3-15-F     2
    2 IGHV3-18-F     7
    3 IGHV3-9-F      1