Kernel keeps dying in Jupyter notebook with pulp solver

I've created an LP solver in a Jupyter notebook that is giving me some issues. Specifically, when I run the last line of code in the script below, I get the error message "The kernel appears to have died. It will restart automatically."

Edit: the final dataframe, dfs_proj, is a 240-row, 5-column dataframe.

import pandas as pd
from pulp import *
from pulp import LpMaximize

# Load the projections CSV and give every row a starting 'count' of 1.
dfs_proj = pd.read_csv("4for4_dfs_projections_120321.csv")
dfs_proj['count'] = 1
# Keep only the five columns the rest of the script reads.
cols = ['Player', 'Pos', 'FFPts', 'DK ($)', 'count']
dfs_proj = dfs_proj[cols]
# Keep rows with a 'DK ($)' value of at least 4000, plus every DEF and TE row
# whatever its 'DK ($)' value.
dfs_proj = dfs_proj[(dfs_proj['DK ($)'] >= 4000) | (dfs_proj['Pos'] == "DEF") | (dfs_proj['Pos'] == "TE")]

# Player -> count lookup. get_optimals builds its own dict of the same name
# from its `data` argument, so this module-level one is not what it updates.
player_dict = dict(zip(dfs_proj['Player'], dfs_proj['count']))

# create a helper function to return the number of players assigned each position
def get_position_sum(player_vars, df, position):
    """
    Return ``pulp.lpSum`` over the rows of ``df`` of
    ``player_vars[i] * (position in df['Pos'].iloc[i])``.

    :param player_vars: sequence indexed by row position of ``df``.
        NOTE(review): the calls in get_optimals pass a list of
        (player name, LpVariable) tuples, so ``player_vars[i]`` here is the
        whole tuple, not the variable.
    :param pd.DataFrame df: frame with a 'Pos' column.
    :param position: value tested with ``in`` against each row's 'Pos' value.
    """
    # The `in` test yields True/False, which is used as the multiplier for row i.
    return pulp.lpSum([player_vars[i] * (position in df['Pos'].iloc[i]) for i in range(len(df))])

def get_optimals(site, data, num_lineups, optimize_on='FFPts'):
    """
    Generates x number of optimal lineups, based on the column to
    designate as the one to optimize on.

    One pulp maximisation problem is built and solved per lineup. From the
    second lineup on, the total 'FFPts' is constrained to stay below the
    previous objective value.

    :param str site: DK or FD. Used for salary constraints
        NOTE(review): never read in the body; the salary limit is the literal 50000.
    :param pd.DataFrame data: Pandas dataframe containing projections.
        The body reads its 'Player', 'count', 'DK ($)' and 'FFPts' columns.
    :param int num_lineups: Number of lineups to generate.
    :param str optimize_on: Name of column in dataframe to use when optimizing
        NOTE(review): the objective below always uses 'FFPts'; this value only
        selects the margin (0.001 for 'Optimal Frequency', otherwise 0.01).
    :return: list of dicts, one per lineup, with 'G1', 'G2', ... keys holding
        player names and 'Total Points' holding the objective value.
    """
    #global lineups
    lineups = []
    # Per-player counter, seeded from the 'count' column and incremented each
    # time the player appears in a lineup.
    player_dict = dict(zip(data['Player'], data['count']))
    for i in range(1, num_lineups+1):
        prob = pulp.LpProblem('DK_NFL_weekly', pulp.const.LpMaximize)
        # One binary variable per row of `data`, stored as (player name, variable).
        player_vars = []
        for row in data.itertuples():
            var = pulp.LpVariable(f'{row.Player}', cat='Binary')
            player_vars.append((row.Player, var))
        # total assigned players constraint
        # NOTE(review): this iterates the (name, variable) tuples themselves,
        # not the variables at index 1.
        prob += pulp.lpSum(player_var for player_var in player_vars) == 9
        # total salary constraint
        prob += pulp.lpSum(data['DK ($)'].iloc[i] * player_vars[i][1] for i in range(len(data))) <= 50000
        # for QB and DST, require 1 of each in the lineup
        # NOTE(review): `df` is neither a parameter nor a local of this function,
        # so it is looked up outside it; the frame passed in is called `data`.
        prob += get_position_sum(player_vars, df, 'QB') == 1
        prob += get_position_sum(player_vars, df, 'DEF') == 1
        
        # to account for the FLEX position, we allow additional selections of the 3 FLEX-eligible positions: RB, WR, TE
        prob += get_position_sum(player_vars, df, 'RB') >= 2
        prob += get_position_sum(player_vars, df, 'WR') >= 3
        prob += get_position_sum(player_vars, df, 'TE') >= 1
        # From the second lineup on, force the 'FFPts' total below the previous
        # objective value (`optimal`, set after the previous solve).
        if i > 1:
            if optimize_on == 'Optimal Frequency':
                prob += pulp.lpSum([data['FFPts'].iloc[i] * player_vars[i][1] for i in range(len(data))]) <= (optimal - 0.001)
            else:
                prob += pulp.lpSum([data['FFPts'].iloc[i] * player_vars[i][1] for i in range(len(data))]) <= (optimal - 0.01)
        
        # Objective: an expression with no comparison, the 'FFPts' total.
        prob += pulp.lpSum([data['FFPts'].iloc[i] * player_vars[i][1] for i in range(len(data))])
        # solve and print the status
        # NOTE(review): msg=False is passed and nothing is printed or checked
        # here, so the solve status is not inspected.
        prob.solve(PULP_CBC_CMD(msg=False))
        optimal = prob.objective.value()
        count = 1
        lineup = {}
        # Collect the players whose variable came back as 1 under keys G1, G2, ...
        # This loop (and the one below) rebinds the lineup counter `i`.
        for i in range(len(data)):    
            if player_vars[i][1].value() == 1:
                row = data.iloc[i]
                lineup[f'G{count}'] = row['Player']
                count += 1
            # Re-assigned on every row of the loop; the value is the same each time.
            lineup['Total Points'] = optimal
        
        lineups.append(lineup)
        players = list(lineup.values())
        for i in range(0, len(players)):
            # Only str values are counted; presumably this is what skips the
            # 'Total Points' number stored in the same dict.
            if type(players[i]) == str:
                player_dict[players[i]] += 1
                # When a counter reaches exactly 45 the player is dropped from
                # `data`, and therefore from every later problem.
                if player_dict[players[i]] == 45:
                    data = data[data['Player'] != players[i]]
    return lineups

# The three arguments are positional, so against the signature
# get_optimals(site, data, num_lineups, optimize_on='FFPts') they bind as
# site=dfs_proj, data=20, num_lineups='FFPts'.
lineups = get_optimals(dfs_proj, 20, 'FFPts')

I have tried reinstalling all the libraries that are used in the script and still get the same issue. Even running it in a normal Python script gives me the same error message. I think this might have to do with memory, but I'm not sure how to check for that or adjust for that, either.

Thanks in advance for any help!

Solution

You had a handful of typos here... Not sure if/how you got this running.

A couple of issues you had:

You co-mingled df and data variable names inside your function. So who knows what that was pulling in. (One of the hazards of working in a notebook.)
In several locations where you used player_vars, you were not indexing the tuple to get the variable piece. I'd suggest you use LpVariable.dicts() for these; it is easier to manage.
Your function call doesn't account for site in the function params.

Other advice:

Do NOT turn off the messaging. You must check the solver output to see the status. First attempts came back as "infeasible" which is how I discovered the player_vars problem. If you do decide to turn off the message, figure out a way to assert(status==optimal) or risk junk results. I think it is doable in pulp, I just forgot how. Edit: here's how. This works when using the default CBC solver, after solving (obviously). Other solvers, not sure:
```
  status = LpStatus[prob.status]
  assert(status=='Optimal')
```
Print out the problem a couple of times while building it to see if it passes the giggle test. If you had done this, you would have seen some of the construction problems.

Anyhow, this is working fine for fake data and handles 1000+ players in a couple seconds for 20 lineups.

Buyer beware: I did not review all of the constraints too closely or the conditional constraint, so you should.

import pandas as pd
from pulp import *
# from pulp import LpMaximize
from random import randint, choice

# Synthetic stand-in for the projections CSV: `num_players` random records
# with the same five columns the real file is reduced to.
num_players = 1000
positions = ['RB', 'WR', 'TE', 'DEF', 'QB']
cols = ['Player', 'Pos', 'FFPts', 'DK ($)', 'count']


def _random_player(player_id):
    """Return one (Player, Pos, FFPts, DK ($), count) record with random values."""
    return (
        player_id,
        choice(positions),       # position drawn uniformly from `positions`
        randint(1, 100),         # projected points, 1..100 inclusive
        randint(3000, 5000),     # salary, 3000..5000 inclusive
        1,                       # starting count
    )


players = list(map(_random_player, range(num_players)))
dfs_proj = pd.DataFrame.from_records(players, columns=cols)
print(dfs_proj.head())

# To run against the real projections instead of the synthetic frame, use:
# dfs_proj = pd.read_csv("4for4_dfs_projections_120321.csv")
# dfs_proj['count'] = 1
# dfs_proj = dfs_proj[cols]

# Keep rows with a salary of at least 4000, plus every DEF and TE row
# whatever its salary.
dfs_proj = dfs_proj.loc[
    lambda frame: (frame['DK ($)'] >= 4000) | frame['Pos'].isin(["DEF", "TE"])
]

# No module-level player_dict is needed: get_optimals builds its own from
# the frame it is given.

print(dfs_proj.head())

# create a helper function to return the number of players assigned each position
def get_position_sum(player_vars, df, position):
    """
    Build the pulp expression that counts the selected players at ``position``.

    :param player_vars: sequence aligned with the rows of ``df``; the decision
        variable of each entry is read from index 1.
    :param pd.DataFrame df: frame with a 'Pos' column.
    :param position: label tested with ``in`` against each row's 'Pos' value.
    :return: ``pulp.lpSum`` of every row's variable multiplied by the
        True/False outcome of that test.
    """
    terms = []
    for row_no, row_pos in enumerate(df['Pos']):
        variable = player_vars[row_no][1]
        # The True/False result acts as a 1/0 coefficient on the variable.
        terms.append(variable * (position in row_pos))
    return pulp.lpSum(terms)

#def get_optimals(site, data, num_lineups, optimize_on='FFPts'):   # site???  # data vs df ???
def get_optimals(data, num_lineups, optimize_on='FFPts'):
    """
    Generate ``num_lineups`` lineups, each scoring strictly less than the one before.

    One binary program is built and solved per lineup: pick exactly 9 players
    within a 50000 salary limit, with exactly 1 QB and 1 DEF and at least
    2 RB, 3 WR and 1 TE, maximising the total of the ``optimize_on`` column.
    From the second lineup on, that total is additionally capped just below
    the previous optimum, so the same lineup cannot be returned twice.

    Each player's appearance counter starts at its 'count' value and is
    incremented per lineup; a player whose counter reaches exactly 45 is
    removed from the pool for all later lineups.

    :param pd.DataFrame data: Pandas dataframe containing projections. Must
        have 'Player', 'Pos', 'DK ($)' and 'count' columns plus the column
        named by ``optimize_on``.
    :param int num_lineups: Number of lineups to generate.
    :param str optimize_on: Name of column in dataframe to use when optimizing.
        'Optimal Frequency' uses a 0.001 margin between consecutive lineups,
        any other column 0.01.
    :return: list of dicts, one per lineup, mapping 'G1', 'G2', ... to player
        names and 'Total Points' to the lineup's objective value.
    :raises RuntimeError: if the solver does not report an optimal solution
        for a lineup (for example because the constraints became infeasible).
    """
    # Local import so the function does not rely on which names
    # ``from pulp import *`` happens to leave in the module namespace.
    import pulp

    roster_size = 9
    salary_cap = 50000
    exposure_limit = 45
    margin = 0.001 if optimize_on == 'Optimal Frequency' else 0.01

    lineups = []
    player_dict = dict(zip(data['Player'], data['count']))
    previous_best = None
    for lineup_no in range(1, num_lineups + 1):
        # Plain Python lists: read each column once per lineup instead of
        # calling .iloc for every coefficient.
        names = data['Player'].tolist()
        salaries = data['DK ($)'].tolist()
        scores = data[optimize_on].tolist()

        prob = pulp.LpProblem('DK_NFL_weekly', pulp.LpMaximize)
        # (player name, variable) pairs, the shape get_position_sum expects.
        player_vars = [
            (name, pulp.LpVariable(f'P{name}', cat='Binary')) for name in names
        ]
        picks = [var for _, var in player_vars]

        # total assigned players constraint
        prob += pulp.lpSum(picks) == roster_size
        # total salary constraint
        prob += pulp.lpSum(s * var for s, var in zip(salaries, picks)) <= salary_cap
        # for QB and DST, require 1 of each in the lineup
        prob += get_position_sum(player_vars, data, 'QB') == 1
        prob += get_position_sum(player_vars, data, 'DEF') == 1
        # to account for the FLEX position, we allow additional selections
        # of the 3 FLEX-eligible positions: RB, WR, TE
        prob += get_position_sum(player_vars, data, 'RB') >= 2
        prob += get_position_sum(player_vars, data, 'WR') >= 3
        prob += get_position_sum(player_vars, data, 'TE') >= 1
        if previous_best is not None:
            # Cut off the previous optimum so this solve must find a
            # different, lower-scoring lineup.
            prob += (
                pulp.lpSum(s * var for s, var in zip(scores, picks))
                <= previous_best - margin
            )

        # Objective: an expression without a comparison.
        prob += pulp.lpSum(s * var for s, var in zip(scores, picks))
        # Dump the model so its construction can be eyeballed.
        print(prob)
        # Solver messages are left on deliberately; the status is also
        # checked explicitly so a failed solve cannot produce junk lineups.
        prob.solve(pulp.PULP_CBC_CMD())
        if prob.status != pulp.LpStatusOptimal:
            raise RuntimeError(
                f"lineup {lineup_no}: solver status is "
                f"{pulp.LpStatus[prob.status]!r}, expected 'Optimal'"
            )
        previous_best = pulp.value(prob.objective)

        # Solver output is floating point, so test against 0.5 rather than == 1.
        chosen = [
            name
            for name, var in player_vars
            if var.value() is not None and var.value() > 0.5
        ]
        lineup = {f'G{slot}': name for slot, name in enumerate(chosen, start=1)}
        lineup['Total Points'] = previous_best
        lineups.append(lineup)

        # Update exposure from the chosen names directly, so it works for
        # any player id type (not only str).
        for name in chosen:
            player_dict[name] += 1
            if player_dict[name] == exposure_limit:
                data = data[data['Player'] != name]
    return lineups

# Build ten lineups ranked on projected fantasy points and show one per line.
lineups = get_optimals(data=dfs_proj, num_lineups=10, optimize_on='FFPts')
for lineup in lineups:
    print(lineup)