Search code examples
ralgorithmsubset-sum

Checking if a set of numbers can be added up to a given value


I have a table like this:

set.seed(123)
random_table <- data.frame(
  Column1 = sample(1:10, 5, replace = TRUE),
  Column2 = sample(1:10, 5, replace = TRUE),
  Column3 = sample(1:10, 5, replace = TRUE),
  Column4 = sample(1:10, 5, replace = TRUE),
  Column5 = sample(1:10, 5, replace = TRUE)
)


 Column1 Column2 Column3 Column4 Column5
       3       5       5       3       9
       3       4       3       8       3
      10       6       9      10       4
       2       9       9       7       1
       6      10       9      10       7

I want to make a function that checks if "n" numbers from this table can sum to some value "m". When this is possible, I want to record all such combinations (else NULL).

I tried to write a function to do this using the combinat library:

library(combinat)

    find_combinations <- function(table, num, target_sum) {

    combinations <- combn(as.vector(as.matrix(table)), num)

    valid_combinations <- list()
    valid_cells <- list()

    for (i in 1:ncol(combinations)) {
        if (sum(combinations[, i]) == target_sum) {
            valid_combinations <- append(valid_combinations, list(combinations[, i]))
        
            cells <- c()
            for (value in combinations[, i]) {
            cell <- which(table == value, arr.ind = TRUE)[1, ]
            cells <- c(cells, paste0(LETTERS[cell[2]], cell[1]))
        }
        valid_cells <- append(valid_cells, list(cells))
    }
}

if (length(valid_combinations) > 0) {
    result <- data.frame(
        id = seq_along(valid_combinations),
        sum = rep(target_sum, length(valid_combinations)),
        numbers_selected = sapply(valid_combinations, function(x) paste(x, collapse = ",")),
        cells = sapply(valid_cells, function(x) paste(x, collapse = ","))
    )
} else {
    result <- data.frame(
        id = NA,
        sum = NA,
        numbers_selected = NA,
        cells = NA
    )
}

return(result)

}

I then called the function for a specific example and removed all duplicates (relative to the cells column):

result <- find_combinations(random_table, num = 4, target_sum = 19)

result$sorted_cells <- sapply(strsplit(result$cells, ","), function(x) paste(sort(x), collapse = ","))
result <- result[!duplicated(result$sorted_cells), ]
result$sorted_cells <- NULL

result$id <- seq_len(nrow(result))

The output looks like this:

 id sum numbers_selected       cells
  1  19         3,3,10,3 A1,A1,A3,A1
  2  19          3,3,6,7 A1,A1,A5,D4
  3  19          3,3,5,8 A1,A1,B1,D2
  4  19          3,3,4,9 A1,A1,B2,B4
  5  19         3,10,2,4 A1,A3,A4,B2

Are there any standard ways to do this in R (ex: Finding all possible combinations of numbers from a vector to reach a given sum (No repetitions), Getting all the combination of numbers from a list that would sum to a specific number)? Or do we really have to write a function?


Solution

  • This is a restricted subset sum problem. Take a look at RcppAlgos::partitionsGeneral (also see this vignette).

    library(RcppAlgos)
    
    result <- partitionsGeneral(unlist(random_table), 4, target = 19)
    dim(result)
    #> [1] 328   4
    
    all(rowSums(result) == 19)
    #> [1] TRUE
    

    A function to get indices:

    f <- function(df, num, target_sum, linear = TRUE) {
      x <- unlist(df, 1, 1)
      vals <- unique(partitionsGeneral(x, num, target = target_sum))
      idx <- array(split(seq_along(x), x)[as.character(vals)], dim(vals))
      out <- do.call(rbind, lapply(asplit(idx, 1), comboGrid, repetition = FALSE))
      if (!(linear || is.vector(df))){
        out[] <- outer(1:nrow(df), LETTERS[1:ncol(df)], \(a, b) paste0(b, a))[out]
      }
      out
    }
    

    Testing:

    result_idx <- f(random_table, 4, 19)
    dim(result_idx)
    #> [1] 552   4
    
    # check that each row corresponds to a set of indices of values that sum correctly
    x <- result_idx
    x[] <- unlist(df, 1, 1)[result_idx]
    all(rowSums(x) == 19)
    #> [1] TRUE
    
    f(random_table, 4, 19, FALSE)[1:5,]
    #>      Var1 Var2 Var3 Var4
    #> [1,] "E4" "A4" "A5" "A3"
    #> [2,] "E4" "A4" "A5" "B5"
    #> [3,] "E4" "A4" "A5" "D3"
    #> [4,] "E4" "A4" "A5" "D5"
    #> [5,] "E4" "A4" "B3" "A3"
    

    Note that for this example, result and result_idx don't have the same number of rows. partitionsGeneral stops early once it has identified all unique outputs (see the comment below from the package's author, Joseph Wood).