Search code examples
rdplyrsapply

Check the unique values of 2 character variables match using set_equal in R


I have 2 data tables with about ~500k obs. and ~50 variables. Some of these variables contain characters and I would like to find a simple, quick way of checking the unique values of each character variable in one data table matches (or doesn't) the corresponding variable in the second data table

I assumed that using sapply and set_equal would give me my answer, but it always returns FALSE for each variable, even when I know the answer should be TRUE. Testing an individual pair of variables, returns the correct answer. I could loop over each pair of variables, and this works, but I'm trying to understand why my sapply/set_equal method doesn't and fix it so it does (or find an alternative non-looping method).

> # Extract & sort unique values from character vars in cube1 for matching character vars
> cube1c <- sapply(cube1m[, ..commonCharCols], unique)
> cube1c <- sapply(cube1c, sort)

> # Extract & sort unique values from character vars in cube2 for matching character vars
> cube2c <- sapply(cube2m[, ..commonCharCols], unique)
> cube2c <- sapply(cube2c, sort)

> # Test if values in each pair of variables are the same
> sapply(cube1c, function(x) setequal(x, cube2c)) 
cVar1 cVar2 cVar3 cVar4 cVar5 cVar6
FALSE FALSE FALSE FALSE FALSE FALSE

> setequal(cube1c$cVar2, cube2c$cVar2)
[1] TRUE

> for (icol in seq_len(length(commonCharCols))) {
+     print(commonCharCols[[icol]])
+     print(setequal(cube1c[[icol]], cube2c[[icol]]))
+ }
[1] "cVar1"
[1] FALSE
[1] "cVar2"
[1] TRUE
[1] "cVar3"
[1] FALSE
[1] "cVar4"
[1] TRUE
[1] "cVar5"
[1] TRUE
[1] "cVar6"
[1] FALSE

Solution

  • Consider mapply for elementwise loop of equal length objects to call setequal:

    output <- mapply(setequal, cube1c, cube2c)
    

    To demonstrate with random, seeded data

    Data

    library(data.table)
    
    set.seed(872019)
    data_tools <- c("sas", "stata", "spss", "python", "r", "julia")
    cube1m <- data.table(replicate(50, sample(data_tools, 10, replace=TRUE)))
    
    head(cube1m)    
    #       V1     V2     V3    V4     V5
    # 1:     r python   spss julia    sas
    # 2: julia   spss python julia python
    # 3: stata      r    sas stata      r
    # 4:     r      r  julia julia    sas
    # 5: julia      r    sas  spss      r
    # 6: stata      r      r     r    sas
    
    cube2m <- data.table(replicate(50, sample(data_tools, 10, replace=TRUE)))
    
    head(cube2m)
    #        V1    V2     V3     V4    V5
    # 1:    sas  spss python      r stata
    # 2:      r  spss  julia    sas     r
    # 3:  julia julia  stata python julia
    # 4:      r     r   spss  stata julia
    # 5:  julia     r  stata python   sas
    # 6: python     r  stata    sas stata
    

    Code + Output

    commonCharCols <- paste0("V", 1:50)
    cube1c <- sapply(cube1m[, ..commonCharCols], function(x) sort(unique(x)))        
    cube2c <- sapply(cube2m[, ..commonCharCols], function(x) sort(unique(x)))
    
    output <- mapply(setequal, cube1c, cube2c)
    
    output
    #    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10   V11   V12   V13   V14   V15 
    # FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE 
    #   V16   V17   V18   V19   V20   V21   V22   V23   V24   V25   V26   V27   V28   V29   V30 
    # FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
    #   V31   V32   V33   V34   V35   V36   V37   V38   V39   V40   V41   V42   V43   V44   V45 
    #  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE 
    #   V46   V47   V48   V49   V50 
    # FALSE FALSE FALSE FALSE FALSE
    
     output[output == TRUE]
     #   V6  V13  V20  V21  V23  V31  V33  V39 
     # TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
    

    Check

    sharedCols <- names(output[output == TRUE])
    
    cube1m[, ..sharedCols]    
    #         V6    V13    V20    V21    V23    V31    V33    V39
    #  1:   spss  julia      r   spss python python python   spss
    #  2:    sas    sas    sas  stata   spss   spss  stata      r
    #  3:    sas  julia      r      r python      r    sas  julia
    #  4:   spss      r  julia   spss  stata      r  stata      r
    #  5: python   spss  stata   spss      r  stata  stata  julia
    #  6:    sas python      r  julia    sas  julia python python
    #  7:      r python    sas python  stata  julia   spss   spss
    #  8:    sas    sas python  stata      r python    sas  julia
    #  9:  julia  stata      r   spss  julia      r python    sas
    # 10:  stata  julia   spss    sas    sas   spss  julia    sas
    
    cube2m[, ..sharedCols]    
    #         V6    V13    V20    V21    V23    V31    V33    V39
    #  1:  julia  julia      r  stata    sas   spss  stata  julia
    #  2: python  julia  julia    sas python  julia python   spss
    #  3:    sas  stata    sas  julia   spss python  stata  julia
    #  4: python  stata    sas      r      r python    sas      r
    #  5:   spss   spss    sas   spss   spss      r  julia python
    #  6:  stata python   spss  julia      r  julia python   spss
    #  7: python    sas python python  julia  julia  stata    sas
    #  8:    sas      r      r    sas    sas  stata   spss   spss
    #  9:  julia    sas  stata    sas  stata python python  julia
    # 10:      r  stata  julia      r   spss      r  stata      r