I am trying to create a matching algorithm using pulp but the results for the sample data I'm getting are wrong as I think the function is flawed.
Sample data:
users = {
1: (5.0, 4.0, 1.0, 2, 1, 1),
2: (8.0, 6.0, 2.0, 3, 2, 1)
}
dataset = pd.DataFrame([
{'id': 1, 'group': 'A', 'weight': 1},
{'id': 2, 'group': 'A', 'weight': 2},
{'id': 3, 'group': 'A', 'weight': 3},
{'id': 4, 'group': 'A', 'weight': 3},
{'id': 5, 'group': 'A', 'weight': 4},
{'id': 6, 'group': 'A', 'weight': 6},
{'id': 7, 'group': 'A', 'weight': 7},
{'id': 8, 'group': 'A', 'weight': 8},
{'id': 9, 'group': 'B', 'weight': 2},
{'d': 10, 'group': 'B', 'weight': 1}
])
I would like to match different ids to users (without repetition). For each user I have a total weight, group A weight, group B weight, unique id count, group A unique id count, group B unique id count.
For the sample above the correct answer should be:
{'id': 5, 'group': 'A', 'weight': 4, 'user_id': 1}
{'id': 10, 'group': 'B', 'weight': 1, 'user_id': 1}
{'id': 3, 'group': 'A', 'weight': 3, 'user_id': 2}
{'id': 4, 'group': 'A', 'weight': 3, 'user_id': 2}
{'id': 9, 'group': 'B', 'weight': 2, 'user_id': 2}
My first attempt:
from pulp import *
import pandas as pd
from itertools import product
def match_weights(users, dataset):
matched_rows = []
variables = LpVariable.dicts("Item", range(len(dataset)), lowBound=0, cat='Binary')
user_vars = {}
for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
user_vars[user_id] = {}
user_vars[user_id]['total_weight'] = LpVariable("TotalWeight_{}".format(user_id), lowBound=0, upBound=total_weight)
user_vars[user_id]['group_a_weight'] = LpVariable("GroupAWeight_{}".format(user_id), lowBound=0, upBound=group_a_weight)
user_vars[user_id]['group_b_weight'] = LpVariable("GroupBWeight_{}".format(user_id), lowBound=0, upBound=group_b_weight)
user_vars[user_id]['total_unique_users'] = LpVariable("TotalUniqueUsers_{}".format(user_id), lowBound=0, upBound=total_unique_users, cat='Integer')
user_vars[user_id]['group_a_unique_users'] = LpVariable("GroupAUniqueUsers_{}".format(user_id), lowBound=0, upBound=group_a_unique_users, cat='Integer')
user_vars[user_id]['group_b_unique_users'] = LpVariable("GroupBUniqueUsers_{}".format(user_id), lowBound=0, upBound=group_b_unique_users, cat='Integer')
prob = LpProblem("MatchingProblem", LpMaximize)
prob += lpSum(variables[i] for i in range(len(dataset)))
for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
group_a_items = dataset[dataset['group'] == 'A'].index.tolist()
group_b_items = dataset[dataset['group'] == 'B'].index.tolist()
# Total weight constraint
prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in range(len(dataset))) <= user_vars[user_id]['total_weight']
# Group A weight constraint
prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_a_items) <= user_vars[user_id]['group_a_weight']
# Group B weight constraint
prob += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_b_items) <= user_vars[user_id]['group_b_weight']
# Total unique user constraint
unique_users = set()
for i in range(len(dataset)):
if variables[i].value() == 1:
unique_users.add(dataset.loc[i, 'id'])
prob += lpSum(1 for u in unique_users) <= user_vars[user_id]['total_unique_users']
# Group A unique user constraint
unique_users_a = set()
for i in group_a_items:
if variables[i].value() == 1:
unique_users_a.add(dataset.loc[i, 'id'])
prob += lpSum(1 for u in unique_users_a) <= user_vars[user_id]['group_a_unique_users']
# Group B unique user constraint
unique_users_b = set()
for i in group_b_items:
if variables[i].value() == 1:
unique_users_b.add(dataset.loc[i, 'id'])
prob += lpSum(1 for u in unique_users_b) <= user_vars[user_id]['group_b_unique_users']
prob.solve()
for user_id, (total_weight, group_a_weight, group_b_weight, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
matched_user_rows = []
for i in range(len(dataset)):
if variables[i].value() == 1:
matched_row = dataset.loc[i].copy()
matched_row['user_id'] = user_id
matched_user_rows.append(matched_row)
matched_rows.extend(matched_user_rows)
return matched_rows
However the results are:
{1: {'group_a': [2], 'group_b': [10]}, 2: {'group_a': [2], 'group_b': [10]}}
Looks like my results might overwrite each other but also look wrong.
I tried to rewrite it and got similar incorrect results:
def match_weights(users, dataset):
model = LpProblem("MatchingProblem", LpMaximize)
variables = LpVariable.dicts("Item", dataset.index, lowBound=0, cat='Binary')
model += lpSum(variables[i] for i in dataset.index)
# Add constraints for each user
for user_id, (total_weight, group_a_weight, group_b_weight, _, _, _) in users.items():
# Filter dataset based on user group
group_a_indices = dataset[dataset['group'] == 'A'].index
group_b_indices = dataset[dataset['group'] == 'B'].index
# Total weight constraint
model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in dataset.index) <= total_weight
# Group A weight constraint
model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_a_indices) <= group_a_weight
# Group B weight constraint
model += lpSum(variables[i] * dataset.loc[i, 'weight'] for i in group_b_indices) <= group_b_weight
unique_user_set = set(dataset['respondent_id'])
for user_id, (total_weight, _, _, total_unique_users, group_a_unique_users, group_b_unique_users) in users.items():
group_a_indices = dataset[dataset['group'] == 'A'].index
group_b_indices = dataset[dataset['group'] == 'B'].index
# Total unique users constraint
model += lpSum(variables[i] for i in dataset.index if dataset.loc[i, 'respondent_id'] in unique_user_set) \
<= total_unique_users
# Group A unique users constraint
model += lpSum(variables[i] for i in group_a_indices if dataset.loc[i, 'respondent_id'] in unique_user_set) \
<= group_a_unique_users
# Group B unique users constraint
model += lpSum(variables[i] for i in group_b_indices if dataset.loc[i, 'respondent_id'] in unique_user_set) \
<= group_b_unique_users
model.solve()
results = {}
for user_id, (_, _, _, _, _, _) in users.items():
group_a_indices = dataset[dataset['group'] == 'A'].index
group_b_indices = dataset[dataset['group'] == 'B'].index
matched_a = [dataset.loc[i, 'respondent_id'] for i in group_a_indices if variables[i].value() == 1]
matched_b = [dataset.loc[i, 'respondent_id'] for i in group_b_indices if variables[i].value() == 1]
results[user_id] = {'group_a': matched_a, 'group_b': matched_b}
return results
Where am I going wrong?
You've set an optimisation objective and optimisation sense when I believe there should be no objective (and the sense left as default). In other words, once you're done formulating, print(prob)
should include
MINIMIZE
None
You should not have variables for total weight, total unique users etc: instead, only have one decision variable kind, which is a binary assignment of whether an ID has been assigned to a user.
The "total" constraint is redundant and you can deal with the individual group values without the total.
Since you're in Pandas, so long as your data are shaped correctly, you do not need to use lpSum
; you can operate on the frames and series directly (.dot()
, .sum()
).
dataset
is a frame (good!) that needs id
to move to an index. users
should be converted from a dictionary to a frame.
import pandas as pd
import pulp
requirements = pd.DataFrame(
data={
1: (5.0, 4.0, 1.0, 2, 1, 1),
2: (8.0, 6.0, 2.0, 3, 2, 1),
},
# row index, soon to turn into column names via .T
index=pd.MultiIndex.from_product(
iterables=(('weight', 'count'), ('total', 'A', 'B')),
names=('requirement', 'group'),
),
).drop(labels='total', level='group').T # redundant
requirements.index.name = 'user'
user_values = requirements.index.to_series()
requirements = requirements.stack(level='group')
print(requirements, end='\n\n')
dataset = pd.DataFrame([
{'id': 1, 'group': 'A', 'weight': 1},
{'id': 2, 'group': 'A', 'weight': 2},
{'id': 3, 'group': 'A', 'weight': 3},
{'id': 4, 'group': 'A', 'weight': 3},
{'id': 5, 'group': 'A', 'weight': 4},
{'id': 6, 'group': 'A', 'weight': 6},
{'id': 7, 'group': 'A', 'weight': 7},
{'id': 8, 'group': 'A', 'weight': 8},
{'id': 9, 'group': 'B', 'weight': 2},
{'id': 10, 'group': 'B', 'weight': 1},
]).set_index('id')
print(dataset, end='\n\n')
combos = pd.merge(left=user_values, right=dataset.index.to_series(), how='cross')
asn_name = 'asn_u' + combos.user.astype(str) + '_i' + combos.id.astype(str)
combos['asn'] = asn_name.apply(pulp.LpVariable, cat=pulp.LpBinary)
combos = combos.set_index(['user', 'id']).asn.unstack(level='user')
print(combos, end='\n\n')
prob = pulp.LpProblem(name='user_assignment')
# Every ID can be assigned to at most one user
for user, total in combos.sum(axis=1).items():
prob.addConstraint(name=f'excl_u{user}', constraint=total <= 1)
for group, dataset_weights in dataset.groupby('group'):
group_assigns = combos.loc[dataset_weights.index]
for user, user_assigns in group_assigns.items():
requirement = requirements.loc[(user, group), :]
prob.addConstraint(
name=f'count_u{user}_g{group}',
constraint=user_assigns.sum() == requirement['count'],
)
prob.addConstraint(
name=f'weight_u{user}_g{group}',
constraint=user_assigns.dot(dataset_weights.weight) == requirement['weight'],
)
print(prob)
prob.solve()
assert prob.status == pulp.LpStatusOptimal
combos = combos.applymap(pulp.LpVariable.value).astype(int)
combos = combos[combos.any(axis=1)]
print(combos)
requirement count weight
user group
1 A 1.0 4.0
B 1.0 1.0
2 A 2.0 6.0
B 1.0 2.0
group weight
id
1 A 1
2 A 2
3 A 3
4 A 3
5 A 4
6 A 6
7 A 7
8 A 8
9 B 2
10 B 1
user 1 2
id
1 asn_u1_i1 asn_u2_i1
2 asn_u1_i2 asn_u2_i2
3 asn_u1_i3 asn_u2_i3
4 asn_u1_i4 asn_u2_i4
5 asn_u1_i5 asn_u2_i5
6 asn_u1_i6 asn_u2_i6
7 asn_u1_i7 asn_u2_i7
8 asn_u1_i8 asn_u2_i8
9 asn_u1_i9 asn_u2_i9
10 asn_u1_i10 asn_u2_i10
user_assignment:
MINIMIZE
None
SUBJECT TO
excl_u1: asn_u1_i1 + asn_u2_i1 <= 1
excl_u2: asn_u1_i2 + asn_u2_i2 <= 1
...
count_u1_gA: asn_u1_i1 + asn_u1_i2 + asn_u1_i3 + asn_u1_i4 + asn_u1_i5
+ asn_u1_i6 + asn_u1_i7 + asn_u1_i8 = 1
weight_u1_gA: asn_u1_i1 + 2 asn_u1_i2 + 3 asn_u1_i3 + 3 asn_u1_i4
+ 4 asn_u1_i5 + 6 asn_u1_i6 + 7 asn_u1_i7 + 8 asn_u1_i8 = 4
...
VARIABLES
0 <= asn_u1_i1 <= 1 Integer
0 <= asn_u1_i10 <= 1 Integer
0 <= asn_u1_i2 <= 1 Integer
0 <= asn_u1_i3 <= 1 Integer
0 <= asn_u1_i4 <= 1 Integer
0 <= asn_u1_i5 <= 1 Integer
0 <= asn_u1_i6 <= 1 Integer
0 <= asn_u1_i7 <= 1 Integer
0 <= asn_u1_i8 <= 1 Integer
0 <= asn_u1_i9 <= 1 Integer
0 <= asn_u2_i1 <= 1 Integer
0 <= asn_u2_i10 <= 1 Integer
0 <= asn_u2_i2 <= 1 Integer
0 <= asn_u2_i3 <= 1 Integer
0 <= asn_u2_i4 <= 1 Integer
0 <= asn_u2_i5 <= 1 Integer
0 <= asn_u2_i6 <= 1 Integer
0 <= asn_u2_i7 <= 1 Integer
0 <= asn_u2_i8 <= 1 Integer
0 <= asn_u2_i9 <= 1 Integer
...
Result - Optimal solution found
user 1 2
id
3 0 1
4 0 1
5 1 0
9 0 1
10 1 0