Search code examples
ralgorithmperformancecombinationsenumeration

Efficiently enumerate all subsets with difference constraints in R


I have a vector V of consecutive integers of length l , e.g., 1, 2, 3, 4, 5, 6, 7. I want to find all subsets of size k such that the difference between any two numbers in the subset can be no less than m, e.g., 2. Using the example above with l = 7, k = 3, and m = 2, the subsets are

1, 3, 5
1, 3, 6
1, 3, 7
1, 4, 6
1, 4, 7
1, 5, 7
2, 4, 6
2, 4, 7
2, 5, 7
3, 5, 7

One approach is to enumerate all possible subsets of size k and discard any that fail to meet the m constraint, but this procedure explodes even when the number of solutions is small.

My current approach is a brute-force algorithm in which I start from the subset with the lowest possible integer and increase the last entry by 1 until the upper limit is reached, increment the previous entry and reset the last entry to the lowest it can be given the increase in the previous entry. That is, I start with 1, 3, 5, then increase the last digit by one to get 1, 3, 6 and 1, 3, 7, and then since the upper limit is reached I increment the middle by 1 (to 4) and set the last digit to the smallest it can be given that digit (to 6) to get 1, 4, 6, and carry on as such. This ends up being quite slow in R for large l, and I'm wondering if there is a clever vectorized solution that can instantly generate all the combinations, possible by capitalizing on the sequential nature of the entries. Implementing this algorithm in Rcpp speeds this up a bit, but I'm still hoping for a more elegant solution if one is available.


Solution

  • Here are several base R options

    • Recursion

    We can define recursive function like below

    f0 <- function(v, k, m) {
        if (k == 1) {
            return(matrix(v))
        }
        d <- Recall(v, k - 1, m)
        u <- unique(d[, ncol(d)])
        uu <- (u + m)[(u + m) %in% v]
        lst <- list()
        for (i in u) {
            dd <- d[d[, ncol(d)] == i, , drop = FALSE]
            p <- uu[uu - i >= m]
            if (length(p) > 0) {
                lst <- append(
                    lst,
                    list(cbind(dd[rep(1:nrow(dd), each = length(p)), , drop = FALSE], p))
                )
            }
        }
        unname(do.call(rbind, lst))
    }
    

    and we can obtain

    > f0(v = 1:7, k = 3, m = 2)
          [,1] [,2] [,3]
     [1,]    1    3    5
     [2,]    1    3    6
     [3,]    1    3    7
     [4,]    1    4    6
     [5,]    1    4    7
     [6,]    2    4    6
     [7,]    2    4    7
     [8,]    1    5    7
     [9,]    2    5    7
    [10,]    3    5    7
    
    > f0(v = 1:10, k = 3, m = 2)
          [,1] [,2] [,3]
     [1,]    1    3    5
     [2,]    1    3    6
     [3,]    1    3    7
     [4,]    1    3    8
     [5,]    1    3    9
     [6,]    1    3   10
     [7,]    1    4    6
     [8,]    1    4    7
     [9,]    1    4    8
    [10,]    1    4    9
    [11,]    1    4   10
    [12,]    2    4    6
    [13,]    2    4    7
    [14,]    2    4    8
    [15,]    2    4    9
    [16,]    2    4   10
    [17,]    1    5    7
    [18,]    1    5    8
    [19,]    1    5    9
    [20,]    1    5   10
    [21,]    2    5    7
    [22,]    2    5    8
    [23,]    2    5    9
    [24,]    2    5   10
    [25,]    3    5    7
    [26,]    3    5    8
    [27,]    3    5    9
    [28,]    3    5   10
    [29,]    1    6    8
    [30,]    1    6    9
    [31,]    1    6   10
    [32,]    2    6    8
    [33,]    2    6    9
    [34,]    2    6   10
    [35,]    3    6    8
    [36,]    3    6    9
    [37,]    3    6   10
    [38,]    4    6    8
    [39,]    4    6    9
    [40,]    4    6   10
    [41,]    1    7    9
    [42,]    1    7   10
    [43,]    2    7    9
    [44,]    2    7   10
    [45,]    3    7    9
    [46,]    3    7   10
    [47,]    4    7    9
    [48,]    4    7   10
    [49,]    5    7    9
    [50,]    5    7   10
    [51,]    1    8   10
    [52,]    2    8   10
    [53,]    3    8   10
    [54,]    4    8   10
    [55,]    5    8   10
    [56,]    6    8   10
    
    • for-loops approach

    You can simply run with for loops like below

    f1 <- function(v, k, m) {
        res <- matrix(v)
        p <- v
        for (i in 1:(k - 1)) {
            q <- (p + m)[(p + m) %in% v]
            lst <- list()
            for (j in 1:nrow(res)) {
                s <- q[q - res[j, i] >= m]
                if (length(s) > 0) {
                    lst <- append(
                        lst,
                        list(cbind(res[rep(j, each = length(s)), , drop = FALSE], s))
                    )
                }
            }
            res <- unname(do.call(rbind, lst))
            p <- q
        }
        res
    }
    

    and will obtain

    > f1(v = 1:7, k = 3, m = 2)
          [,1] [,2] [,3]
     [1,]    1    3    5
     [2,]    1    3    6
     [3,]    1    3    7
     [4,]    1    4    6
     [5,]    1    4    7
     [6,]    2    4    6
     [7,]    2    4    7
     [8,]    1    5    7
     [9,]    2    5    7
    [10,]    3    5    7
    
    > f1(v = 1:10, k = 3, m = 2)
          [,1] [,2] [,3]
     [1,]    1    3    5
     [2,]    1    3    6
     [3,]    1    3    7
     [4,]    1    3    8
     [5,]    1    3    9
     [6,]    1    3   10
     [7,]    1    4    6
     [8,]    1    4    7
     [9,]    1    4    8
    [10,]    1    4    9
    [11,]    1    4   10
    [12,]    2    4    6
    [13,]    2    4    7
    [14,]    2    4    8
    [15,]    2    4    9
    [16,]    2    4   10
    [17,]    1    5    7
    [18,]    1    5    8
    [19,]    1    5    9
    [20,]    1    5   10
    [21,]    2    5    7
    [22,]    2    5    8
    [23,]    2    5    9
    [24,]    2    5   10
    [25,]    3    5    7
    [26,]    3    5    8
    [27,]    3    5    9
    [28,]    3    5   10
    [29,]    1    6    8
    [30,]    1    6    9
    [31,]    1    6   10
    [32,]    2    6    8
    [33,]    2    6    9
    [34,]    2    6   10
    [35,]    3    6    8
    [36,]    3    6    9
    [37,]    3    6   10
    [38,]    4    6    8
    [39,]    4    6    9
    [40,]    4    6   10
    [41,]    1    7    9
    [42,]    1    7   10
    [43,]    2    7    9
    [44,]    2    7   10
    [45,]    3    7    9
    [46,]    3    7   10
    [47,]    4    7    9
    [48,]    4    7   10
    [49,]    5    7    9
    [50,]    5    7   10
    [51,]    1    8   10
    [52,]    2    8   10
    [53,]    3    8   10
    [54,]    4    8   10
    [55,]    5    8   10
    [56,]    6    8   10
    
    • Reduce approach

    Another option is using Reduce, which iteratively increases the dimension of the resulting data.frame by adding eligible elements from v as a new column.

    f2 <- function(v, k, m) {
        helper <- function(df, v) {
            u <- unique(df[[length(df)]])
            v <- (u + m)[(u + m) %in% v]
            grp <- split(df, df[length(df)])
            lst <- lapply(
                grp,
                \(x) {
                    p <- v[v - x[[length(x)]][1] >= 2]
                    if (length(p) > 0) {
                        cbind(x[rep(1:nrow(x), each = length(p)), ], p)
                    }
                }
            )
            as.data.frame(`row.names<-`(unname(do.call(rbind, lst)), NULL))
        }
        Reduce(helper, rep(list(v), k - 1), init = as.data.frame(v))
    }
    

    and you will obtain

    > f2(v = 1:7, k = 3, m = 2)
            
    1  1 3 5
    2  1 3 6
    3  1 3 7
    4  1 4 6
    5  1 4 7
    6  2 4 6
    7  2 4 7
    8  1 5 7
    9  2 5 7
    10 3 5 7
    
    > f2(v = 1:10, k = 3, m = 2)
    
    1  1 3  5
    2  1 3  6
    3  1 3  7
    4  1 3  8
    5  1 3  9
    6  1 3 10
    7  1 4  6
    8  1 4  7
    9  1 4  8
    10 1 4  9
    11 1 4 10
    12 2 4  6
    13 2 4  7
    14 2 4  8
    15 2 4  9
    16 2 4 10
    17 1 5  7
    18 1 5  8
    19 1 5  9
    20 1 5 10
    21 2 5  7
    22 2 5  8
    23 2 5  9
    24 2 5 10
    25 3 5  7
    26 3 5  8
    27 3 5  9
    28 3 5 10
    29 1 6  8
    30 1 6  9
    31 1 6 10
    32 2 6  8
    33 2 6  9
    34 2 6 10
    35 3 6  8
    36 3 6  9
    37 3 6 10
    38 4 6  8
    39 4 6  9
    40 4 6 10
    41 1 7  9
    42 1 7 10
    43 2 7  9
    44 2 7 10
    45 3 7  9
    46 3 7 10
    47 4 7  9
    48 4 7 10
    49 5 7  9
    50 5 7 10
    51 1 8 10
    52 2 8 10
    53 3 8 10
    54 4 8 10
    55 5 8 10
    56 6 8 10
    

    Benchmarking

    The benchmark includes all existing answers to this question

    v <- 1:20
    k <- 4
    m <- 2
    microbenchmark(
        f_AC = f(v, k, m),    # Allan Cameron's solution
        f_TIC0 = f0(v, k, m), # ThomasIsCoding's solution 0
        f_TIC1 = f1(v, k, m), # ThomasIsCoding's solution 1
        f_TIC2 = f2(v, k, m), # ThomasIsCoding's solution 2
        times = 20L,
        unit = "relative"
    )
    

    and we see

    Unit: relative
       expr       min       lq     mean    median        uq      max neval
       f_AC 82.677137 69.78411 43.01843 83.497758 81.922518 6.791794    20
     f_TIC0  1.000000  1.00000  1.00000  1.000000  1.000000 1.000000    20
     f_TIC1  7.731772  7.74679  4.75455  8.012004  7.592652 1.316215    20
     f_TIC2 19.764325 16.68923 11.65485 17.963988 24.911318 2.359821    20
    

    For a pressure test with v <- 1:100, k <- 5 and m <- 2, the performance of recursions f (by @Allan Cameron) and f0 (by @ThomasIsCoding) are shown as below

    > system.time(
    +     res <- f0(v = 1:100, k = 5, m = 2)
    + )
       user  system elapsed 
       3.33    1.99    5.39
    
    > system.time(
    +     res <- f(v = 1:100, k = 5, m = 2)
    + )
       user  system elapsed 
     146.70    4.17  157.02