r optimization loss-function cost-management

Minimise cost of reallocating individuals

I have individuals that belong to different categories, they are located in different zones, these populations are expected to grow from the population value below to the demand value.

population_and_demand_by_category_and_zone <- tibble::tribble(
  ~category, ~zone, ~population, ~demand,
        "A",     1,         115,     138,
        "A",     2,         121,     145,
        "A",     3,         112,     134,
        "A",     4,          76,      91,
        "B",     1,          70,      99,
        "B",     2,          59,      83,
        "B",     3,          86,     121,
        "B",     4,         139,     196,
        "C",     1,         142,     160,
        "C",     2,          72,      81,
        "C",     3,          29,      33,
        "C",     4,          58,      66,
        "D",     1,          22,      47,
        "D",     2,          23,      49,
        "D",     3,          16,      34,
        "D",     4,          45,      96
)

Zones have a given capacity, current population is below this threshold, but demand will exceed capacity in some zones.

demand_and_capacity_by_zone <- tibble::tribble(
  ~zone, ~demand, ~capacity, ~capacity_exceeded,
      1,     444,       465,              FALSE,
      2,     358,       393,              FALSE,
      3,     322,       500,              FALSE,
      4,     449,       331,               TRUE
)

So we will need to move those individuals to a new zone (we assume we have enough total capacity). Each individual that we need to move incurs a cost, which depends on its category and destination zone. These costs are given below.

costs <- tibble::tribble(
  ~category, ~zone, ~cost,
        "A",     1,   0.1,
        "A",     2,   0.1,
        "A",     3,   0.1,
        "A",     4,   1.3,
        "B",     1,  16.2,
        "B",     2,  38.1,
        "B",     3,   1.5,
        "B",     4,   0.1,
        "C",     1,   0.1,
        "C",     2,  12.7,
        "C",     3,  97.7,
        "C",     4,  46.3,
        "D",     1,  25.3,
        "D",     2,   7.7,
        "D",     3,  67.3,
        "D",     4,   0.1
)

I wish to find the distribution of individuals across zones and categories so that the total cost is minimized. So basically have a new column new_population in the table population_and_demand_by_category_and_zone described above.

If several solutions are possible, any will do, if the result is a non integer population, that's fine.

The real use case has about 20 categories and 30 zones, so bigger but not all that big.

It seems like a problem that would be common enough so I'm hoping that there is a convenient way to solve this in R.

Solution

I've taken Erwin's formulation, modified it to consider that alloc should be more than the population for every zone and category, (which means already present individuals don't move), and implemented it using the {lpSolve} package, which doesn't require installing external system libraries.

Erwin's solution can be obtained by using move_new_only <- FALSE below.

SETUP

library(tidyverse)
library(lpSolve)

move_new_only  <- TRUE # means population in place can't be reallocated
categories     <- unique(population_and_demand_by_category_and_zone$category)
zones          <- unique(population_and_demand_by_category_and_zone$zone)
n_cat          <- length(categories)
n_zones        <- length(zones)

# empty coefficient arrays
move_coefs_template  <- array(0, c(n_zones, n_zones, n_cat), 
                              dimnames = list(zones, zones, categories))
alloc_coefs_template <- matrix(0, n_zones, n_cat, 
                               dimnames = list(zones, categories))

build_zone_by_category_matrix <- function(data, col) {
  data %>% 
    pivot_wider(
      id_cols = zone, names_from = category, values_from = {{col}}) %>% 
    as.data.frame() %>% 
    `row.names<-`(.$zone) %>% 
    select(-zone) %>% 
    as.matrix()
}

demand_mat <- build_zone_by_category_matrix(
  population_and_demand_by_category_and_zone, demand)

cost_mat <- build_zone_by_category_matrix(costs, cost)

population_mat <- build_zone_by_category_matrix(
  population_and_demand_by_category_and_zone, population)

OBJECTIVE FUNCTION : total cost

# stack the cost matrix vertically to build an array of all move coefficients
coefs_obj <- move_coefs_template
for(i in 1:n_zones) {
  coefs_obj[i,,] <- cost_mat
}
# flatten it for `lp`s `objective.in` argument, adding alloc coefs
f.obj <- c(coefs_obj, alloc_coefs_template)

CONSTRAINT 1 : allocation = demand + inflow - outflow

coefs_con <- list() 
for (z in zones) {
  coefs_con_zone <- list() 
  for(categ in categories) {
    coefs_arrivals <- move_coefs_template
    coefs_arrivals[,z, categ] <- 1
    coefs_departures <- move_coefs_template
    coefs_departures[z,, categ] <- 1
    coefs_moves <- coefs_arrivals - coefs_departures
    coefs_alloc <- alloc_coefs_template
    coefs_alloc[z, categ] <- -1
    # flatten the array
    coefs_con_zone[[categ]] <- c(coefs_moves, coefs_alloc)
  }
  coefs_con[[z]] <- do.call(rbind, coefs_con_zone)
}

# stack the flattened arrays to build a matrix
f.con1 <- do.call(rbind, coefs_con)
f.dir1 <- rep("==", n_zones * n_cat)
f.rhs1 <- -c(t(demand_mat)) # transposing so we start with all zone 1 and so on

CONSTRAINT 2 : Allocation never exceeds capacity

coefs_con <- list() 
for (z in zones) {
  coefs_alloc <- alloc_coefs_template
  coefs_alloc[z, ] <- 1
  coefs_con[[z]] <- c(move_coefs_template, coefs_alloc)
}

# stack the flattened arrays to build a matrix
f.con2 <- do.call(rbind, coefs_con)
f.dir2 <- rep("<=", n_zones)

f.rhs2 <- demand_and_capacity_by_zone$capacity

CONSTRAINT 3 : Allocation >= Population

i.e. we don't move people that were already there.

If we decide we can move them the rule becomes Allocation >= 0, and we get Erwin's answer.

coefs_con <- list() 
for (z in zones) {
  coefs_con_zone <- list() 
  for(categ in categories) {
    coefs_alloc <- alloc_coefs_template
    coefs_alloc[z, categ] <- 1
    # flatten the array
    coefs_con_zone[[categ]] <- c(move_coefs_template, coefs_alloc)
  }
  coefs_con[[z]] <- do.call(rbind, coefs_con_zone)
}

# stack the flattened arrays to build a matrix
f.con3 <- do.call(rbind, coefs_con)
f.dir3 <- rep(">=", n_zones * n_cat)

if (move_new_only) {
  f.rhs3 <- c(t(population_mat))
} else {
  f.rhs3 <- rep(0, n_zones * n_cat)
}

CONCATENATE OBJECTS

f.con <- rbind(f.con1, f.con2, f.con3)
f.dir <- c(f.dir1, f.dir2, f.dir3)
f.rhs <- c(f.rhs1, f.rhs2, f.rhs3)

SOLVE

# compute the solution and visualize it in the array
results_raw <- lp("min", f.obj, f.con, f.dir, f.rhs)$solution
results_moves <- move_coefs_template
results_moves[] <- 
  results_raw[1:length(results_moves)]
results_allocs <- alloc_coefs_template
results_allocs[] <- 
  results_raw[length(results_moves)+(1:length(results_allocs))]
results_moves
#> , , A
#> 
#>    1 2 3 4
#> 1  0 0 0 0
#> 2  0 0 3 0
#> 3  0 0 0 0
#> 4 13 0 2 0
#> 
#> , , B
#> 
#>   1 2  3 4
#> 1 0 0  0 0
#> 2 0 0  0 0
#> 3 0 0  0 0
#> 4 0 0 57 0
#> 
#> , , C
#> 
#>   1 2 3 4
#> 1 0 0 0 0
#> 2 0 0 0 0
#> 3 0 0 0 0
#> 4 8 0 0 0
#> 
#> , , D
#> 
#>   1  2 3 4
#> 1 0  0 0 0
#> 2 0  0 0 0
#> 3 0  0 0 0
#> 4 0 38 0 0

results_allocs
#>     A   B   C  D
#> 1 151  99 168 47
#> 2 142  83  81 87
#> 3 139 178  33 34
#> 4  76 139  58 58

TIDY RESULTS

# format as tidy data frame
results_df <-
  as.data.frame.table(results_moves) %>% 
  setNames(c("from", "to", "category", "n")) %>% 
  filter(n != 0) %>% 
  mutate_at(c("from", "to"), as.numeric) %>% 
  mutate_if(is.factor, as.character)

results_df
#>   from to category  n
#> 1    4  1        A 13
#> 2    2  3        A  3
#> 3    4  3        A  2
#> 4    4  3        B 57
#> 5    4  1        C  8
#> 6    4  2        D 38

UPDATE TABLES

population_and_demand_by_category_and_zone <-
  bind_rows(
  results_df %>% 
  group_by(category, zone = to) %>% 
  summarize(correction = sum(n), .groups = "drop"),
  results_df %>% 
    group_by(category, zone = from) %>% 
    summarize(correction = -sum(n), .groups = "drop"),
  ) %>% 
  left_join(population_and_demand_by_category_and_zone, ., by = c("category", "zone")) %>% 
  replace_na(list(correction =0)) %>% 
  mutate(new_population = demand + correction)

population_and_demand_by_category_and_zone
#> # A tibble: 16 × 6
#>    category  zone population demand correction new_population
#>    <chr>    <dbl>      <dbl>  <dbl>      <dbl>          <dbl>
#>  1 A            1        115    138      13               151
#>  2 A            2        121    145      -3.00            142
#>  3 A            3        112    134       5.00            139
#>  4 A            4         76     91     -15.0              76
#>  5 B            1         70     99       0                99
#>  6 B            2         59     83       0                83
#>  7 B            3         86    121      57               178
#>  8 B            4        139    196     -57               139
#>  9 C            1        142    160       8               168
#> 10 C            2         72     81       0                81
#> 11 C            3         29     33       0                33
#> 12 C            4         58     66      -8                58
#> 13 D            1         22     47       0                47
#> 14 D            2         23     49      38                87
#> 15 D            3         16     34       0                34
#> 16 D            4         45     96     -38                58


demand_and_capacity_by_zone <-
  population_and_demand_by_category_and_zone %>% 
  group_by(zone) %>% 
  summarise(population = sum(population), correction = sum(correction), new_population = sum(new_population)) %>% 
  left_join(demand_and_capacity_by_zone, ., by = "zone")
#> `summarise()` ungrouping output (override with `.groups` argument)
  
demand_and_capacity_by_zone
#> # A tibble: 4 × 7
#>    zone demand capacity capacity_exceeded population correction new_population
#>   <dbl>  <dbl>    <dbl> <lgl>                  <dbl>      <dbl>          <dbl>
#> 1     1    444      465 FALSE                    349         21            465
#> 2     2    358      393 FALSE                    275         35            393
#> 3     3    322      500 FALSE                    243         62            384
#> 4     4    449      331 TRUE                     318       -118            331

We see that the population never decreases and stays under capacity.