Search code examples
rstringdataframetibbledata-extraction

find common values of a column based on group of another a column of data frame in R


I have data frame like this:

df<-tibble(id=c("ls1","ls1","ls1","ls2","ls2","ls3","ls5","ls5","ls10","ls10","ls14"),
               target=c("A","A","B","G","H","A","B","B","G","HA","B"))

I would like to have a list of common values from target column within groups of id and also between groups of id variable. The result can look like something like below table:

res<-tibble(id=c("ls1","ls1","ls1","ls2","ls2","ls3","ls5","ls5","ls10","ls10","ls14"),
            target=c("A","A","B","G","H","A","B","B","G","HA","B"),
            withinGroup=c(T,T,F,F,F,F,F,T,T,F,F),
            numberofRepwithinGroup=c(2,2,1,1,1,1,1,2,2,1,1),
            betweenGroups=c(T,T,T,T,F,T,T,T,T,F,T),
            numberofRepbetweenGroups=c(2,2,3,2,0,3,3,3,2,0,3))

Any idea how to do it?


Solution

  • You can do it with a couple of mutate():

    library(dplyr)
    
    df |>
      # first group by
      group_by(id, target) |>
      # add the within columns
      mutate(numberofRepwithinGroup = length(target),
             withinGroup            = ifelse(numberofRepwithinGroup > 1,T,F)) |> 
      # second group by
      group_by(target) |>
      # add the between columns
      mutate(numberofRepbetweenGroups = ifelse(n_distinct(id) == 1, 0, n_distinct(id)),
             betweenGroups            = ifelse(numberofRepbetweenGroups  > 0,T,F)) |>
       # reorder columns
      select(id,target, withinGroup, numberofRepwithinGroup, betweenGroups, numberofRepbetweenGroups
      ) |> 
      # remove useless grouping
      ungroup()
    
    # A tibble: 11 x 6
       id    target withinGroup numberofRepwithinGroup betweenGroups numberofRepbetweenGroups
       <chr> <chr>  <lgl>                        <int> <lgl>                            <dbl>
     1 ls1   A      TRUE                             2 TRUE                                 2
     2 ls1   A      TRUE                             2 TRUE                                 2
     3 ls1   B      FALSE                            1 TRUE                                 3
     4 ls2   G      FALSE                            1 TRUE                                 2
     5 ls2   H      FALSE                            1 FALSE                                0
     6 ls3   A      FALSE                            1 TRUE                                 2
     7 ls5   B      TRUE                             2 TRUE                                 3
     8 ls5   B      TRUE                             2 TRUE                                 3
     9 ls10  G      FALSE                            1 TRUE                                 2
    10 ls10  HA     FALSE                            1 FALSE                                0
    11 ls14  B      FALSE                            1 TRUE                                 3