Search code examples
rmathematical-optimizationcombinatorics

Create a combination of factors with optimization


library(dplyr)
library(tidyr)

df <- data.frame(
  First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8"),
  Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
             "MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
             "MW3; MW4; MW5", "MW6; MW3; MW7")
)

df <- df %>%
  mutate(
    ID = row_number(),
    lmt = n_distinct(ID)
  ) %>%
  separate_rows(Second, sep = "; ") %>%
  group_by(ID) %>%
  mutate(
    wgt = row_number()
  ) %>% ungroup()

Let's say that for each ID I want to keep only 1 combination of First and Second (i.e. the length of unique IDs in df should always be equal to lmt).

However, I'd like to do that with optimizing certain parameters. The solution should be designed in such a way that:

  • Combinations with wgt 1 should be selected whenever possible, alternatively also 2, but 3 should be avoided (i.e. sum of wgt should be minimal);

  • The difference between the frequency of a value in Second and frequency in First should be close to 0.

Any ideas on how to approach this in R?

Expected output for the above case is:

     ID First Second   wgt   lmt
1     1   MW3    MW4     1     8
2     2   MW3    MW7     3     8
3     3   MW4    MW7     2     8
4     4   MW5    MW5     1     8
5     5   MW6    MW3     1     8
6     6   MW7    MW8     2     8
7     7   MW7    MW3     1     8
8     8   MW8    MW6     1     8

Why? Simply because with this combination, there is not more of any element on the right side (Second) that it is on the left (First). For example, there are two MW3 elements on the right as well as on the left.

However, the price to pay here is that wgt is not always 1 (sum of wgt is not 8 but 12).

Clarification: In case both criteria cannot be minimized at the same time, the minimization of 2nd criteria (difference between frequencies) should be prioritized.


Solution

  • I played around with this problem and I can share a solution using a variation of minconflicts algorithm. The key here is to find a scoring function that combines your requirements. The implementation below follows your recommendation 'let's say the objective should be to prioritize the minimization of 2nd criteria (difference between frequencies)'. Experiment with other scoring functions on your actual data and let's see how far you get.

    On your original data (8 IDs) I got solution equally good as the one you posted:

    > solution_summary(current_solution)
       Name FirstCount SecondCount diff
    1:  MW3          2           2    0
    2:  MW4          1           1    0
    3:  MW5          1           1    0
    4:  MW6          1           1    0
    5:  MW7          2           2    0
    6:  MW8          1           1    0
    [1] "Total freq diff:  0"
    [1] "Total wgt:  12"
    

    With random data with 10000 IDs the algorithm is able to find solution with no difference in First/Second frequencies (but sum of wgt is bigger than minimum):

    > solution_summary(current_solution)
       Name FirstCount SecondCount diff
    1:  MW3       1660        1660    0
    2:  MW4       1762        1762    0
    3:  MW5       1599        1599    0
    4:  MW6       1664        1664    0
    5:  MW7       1646        1646    0
    6:  MW8       1669        1669    0
    [1] "Total freq diff:  0"
    [1] "Total wgt:  19521"
    

    Code below:

    library(data.table)
    df <- as.data.table(df)
    df <- df[, .(ID, First, Second, wgt)]
    
    # PLAY AROUND WITH THIS PARAMETER
    freq_weight <- 0.9
    
    wgt_min <- df[, uniqueN(ID)]
    wgt_max <- df[, uniqueN(ID) * 3]
    
    freq_min <- 0
    freq_max <- df[, uniqueN(ID) * 2] #verify if this is the worst case scenario
    
    score <- function(solution){
      # compute raw scores
      current_wgt <- solution[, sum(wgt)]
      second_freq <- solution[, .(SecondCount = .N), by = Second]
      names(second_freq)[1] <- "Name"
      compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
      compare[is.na(compare)] <- 0
      compare[, diff := abs(FirstCount - SecondCount)]
      current_freq <- compare[, sum(diff)]
    
      # normalize
      wgt_score <- (current_wgt - wgt_min) / (wgt_max - wgt_min)
      freq_score <- (current_freq - freq_min) / (freq_max - freq_min)
    
      #combine
      score <- (freq_weight * freq_score) + ((1 - freq_weight) * wgt_score)
      return(score)
    }
    
    #initialize random solution
    current_solution <- df[, .SD[sample(.N, 1)], by = ID]
    
    #get freq of First (this does not change)
    First_freq <- current_solution[, .(FirstCount = .N), by = First]
    names(First_freq)[1] <- "Name"
    
    #get mincoflict to be applied on each iteration
    minconflict <- function(df, solution){
      #pick ID
      change <- solution[, sample(unique(ID), 1)]
    
      #get permissible values
      values <- df[ID == change, .(Second, wgt)]
    
      #assign scores
      values[, score := NA_real_]
      for (i in 1:nrow(values)) {
        solution[ID == change, c("Second", "wgt") := values[i, .(Second, wgt)]]
        set(values, i, "score", score(solution))
      }
    
      #return the best combination
      scores <<- c(scores, values[, min(score)])
      solution[ID == change, c("Second", "wgt") := values[which.min(score), .(Second, wgt)]]
    }
    
    #optimize
    scores <- 1
    iter <- 0
    while(TRUE){
      minconflict(df, current_solution)
      iter <- iter + 1
      #SET MAX NUMBER OF ITERATIONS HERE
      if(scores[length(scores)] == 0 | iter >= 1000) break
    }
    
    # summarize obtained solution
    solution_summary <- function(solution){
      second_freq <- solution[, .(SecondCount = .N), by = Second]
      names(second_freq)[1] <- "Name"
      compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
      compare[is.na(compare)] <- 0
      compare[, diff := abs(FirstCount - SecondCount)]
      print(compare)
      print(paste("Total freq diff: ", compare[, sum(diff)]))
      print(paste("Total wgt: ", solution[, sum(wgt)]))
    }
    solution_summary(current_solution)