Search code examples
rsubset

How to subset two data frames based on the common values in two columns in R


I have two data frames and I need to find the rows that are common across both based on the values in two columns (i.e., V1, and V2).

df1 <- V1(X,X,Y,Z,Z)
       V2(Q,E,W,Q,Q)
       V3(D,D,Y,V,J)
       V4(O,Z,A,Q,E)

df2 <- V1(X,B,Y,A,Z)
       V2(Q,E,W,Q,Q)
       V3(M,U,C,D,D)
       V4(P,U,I,V,D)

I would like to subset both data frames so that only rows including common values remain. Here you can see what I aim to have at the end.

df1_subset <- V1(X,Y,Z)
              V2(Q,W,Q)
              V3(D,Y,J)
              V4(O,A,E)

df2_subset <- V1(X,Y,Z)
              V2(Q,W,Q)
              V3(M,C,D)
              V4(P,I,D)

I would highly appreciate your help.


Solution

  • Repex data:

    df1 <- data.frame(
      "V1" = c("X","X","Y","Z","Z"),
      "V2" = c("Q","E","W","Q","Q"),
      "V3" = c("D","D","Y","V","J"),
      "V4" = c("O","Z","A","Q","E")
    )
    
    df2 <- data.frame(
      "V1" = c("X","B","Y","A","Z"),
      "V2" = c("Q","E","W","Q","Q"),
      "V3" = c("M","U","C","D","D"),
      "V4" = c("P","U","I","V","D")
    )
    

    Option 1: Using base R interaction

    idx1 <- interaction(df1[,c("V1", "V2")]) %in% interaction(df2[,c("V1", "V2")])
    idx2 <- interaction(df2[,c("V1", "V2")]) %in% interaction(df1[,c("V1", "V2")])
    
    df1_subset <- df1[idx1,]
    #    V1 V2 V3 V4
    # 1  X  Q  D  O
    # 3  Y  W  Y  A
    # 4  Z  Q  V  Q
    # 5  Z  Q  J  E
    df2_subset <- df2[idx2,]
    #    V1 V2 V3 V4
    # 1  X  Q  M  P
    # 3  Y  W  C  I
    # 5  Z  Q  D  D
    

    Option 2: Using dplyr semi-join

    library(dplyr)
    df1_subset_dplyr <- semi_join(df1, df2, by = c("V1", "V2"))
    #    V1 V2 V3 V4
    # 1  X  Q  D  O
    # 2  Y  W  Y  A
    # 3  Z  Q  V  Q
    # 4  Z  Q  J  E
    df2_subset_dplyr <- semi_join(df2, df1, by = c("V1", "V2"))
    #    V1 V2 V3 V4
    # 1  X  Q  M  P
    # 2  Y  W  C  I
    # 3  Z  Q  D  D
    

    Or another option of choice.