I have seen a lot of questions on this topic but none of them contain a solution that works for me yet. Here is my full code:
import pandas as pd
import requests
import time
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Game Id
game_id = '0021900001'
# Headers for API Request
header_data = {
'Host': 'stats.nba.com',
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
'Referer': 'stats.nba.com',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
###
### Calculate Players on court at the start of each period
###
# Build advanced boxscore url
def advanced_boxscore_url(game_id, start, end):
return 'https://stats.nba.com/stats/boxscoretraditionalv2/?gameId={0}&startPeriod=0&endPeriod=14&startRange={1}&endRange={2}&rangeType=2'.format(game_id, start, end)
# Helper functions
def calculate_time_at_period(period):
if period > 5:
return (720 * 4 + (period - 5) * (5 * 60)) * 10
else:
return (720 * (period - 1)) * 10
def split_subs(df, tag):
subs = df[[tag, 'PERIOD', 'EVENTNUM']]
subs['SUB'] = tag
subs.columns = ['PLAYER_ID', 'PERIOD', 'EVENTNUM', 'SUB']
return subs
def frame_to_row(df):
team1 = df['TEAM_ID'].unique()[0]
team2 = df['TEAM_ID'].unique()[1]
players1 = df[df['TEAM_ID'] == team1]['PLAYER_ID'].tolist()
players1.sort()
players2 = df[df['TEAM_ID'] == team2]['PLAYER_ID'].tolist()
players2.sort()
lst = [team1]
lst.append(players1)
lst.append(team2)
lst.append(players2)
return lst
# extracts data from api response
def extract_data(url):
print(url)
r = requests.get(url, headers=header_data)
resp = r.json()
results = resp['resultSets'][0]
headers = results['headers']
rows = results['rowSet']
frame = pd.DataFrame(rows)
frame.columns = headers
return frame
play_by_play = pd.read_csv('pbp_v3_0210.csv')
substitutionsOnly = play_by_play[play_by_play['EVENTMSGTYPE'] == 8][['PERIOD', 'EVENTNUM', 'PLAYER1_ID', 'PLAYER2_ID']]
substitutionsOnly.columns = ['PERIOD', 'EVENTNUM', 'OUT', 'IN']
subs_in = split_subs(substitutionsOnly, 'IN')
subs_out = split_subs(substitutionsOnly, 'OUT')
full_subs = pd.concat([subs_out, subs_in], axis=0).reset_index()[['PLAYER_ID', 'PERIOD', 'EVENTNUM', 'SUB']]
first_event_of_period = full_subs.loc[full_subs.groupby(by=['PERIOD', 'PLAYER_ID'])['EVENTNUM'].idxmin()]
players_subbed_in_at_each_period = first_event_of_period[first_event_of_period['SUB'] == 'IN'][
['PLAYER_ID', 'PERIOD', 'SUB']]
periods = players_subbed_in_at_each_period['PERIOD'].drop_duplicates().values.tolist()
rows = []
for period in periods:
low = calculate_time_at_period(period) + 5
high = calculate_time_at_period(period + 1) - 5
boxscore = advanced_boxscore_url(game_id, low, high)
# time.sleep(2)
boxscore_players = extract_data(boxscore)[['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ID']]
boxscore_players['PERIOD'] = period
players_subbed_in_at_period = players_subbed_in_at_each_period[players_subbed_in_at_each_period['PERIOD'] == period]
joined_players = pd.merge(boxscore_players, players_subbed_in_at_period, on=['PLAYER_ID', 'PERIOD'], how='left')
joined_players = joined_players[pd.isnull(joined_players['SUB'])][['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ID', 'PERIOD']]
row = frame_to_row(joined_players)
row.append(period)
rows.append(row)
players_on_court_at_start_of_period = pd.DataFrame(rows)
cols = ['TEAM_ID_1', 'TEAM_1_PLAYERS', 'TEAM_ID_2', 'TEAM_2_PLAYERS', 'PERIOD']
players_on_court_at_start_of_period.columns = cols
holder = "00219"
for x in range(2, 10):
time.sleep(2)
if x == 707:
continue
excess = ""
if(x < 10):
excess = "0000" + str(x)
elif(x < 100):
excess = "000" + str(x)
elif(x < 1000):
excess = "00" + str(x)
else:
excess = "0" + str(x)
holder = "00219" + excess
game_id = holder
holder_play_by_play = pd.read_csv('pbp_for_parsing')
substitutionsOnly = holder_play_by_play[holder_play_by_play['EVENTMSGTYPE'] == 8][
['PERIOD', 'EVENTNUM', 'PLAYER1_ID', 'PLAYER2_ID']]
substitutionsOnly.columns = ['PERIOD', 'EVENTNUM', 'OUT', 'IN']
subs_in = split_subs(substitutionsOnly, 'IN')
subs_out = split_subs(substitutionsOnly, 'OUT')
full_subs = pd.concat([subs_out, subs_in], axis=0).reset_index()[['PLAYER_ID', 'PERIOD', 'EVENTNUM', 'SUB']]
first_event_of_period = full_subs.loc[full_subs.groupby(by=['PERIOD', 'PLAYER_ID'])['EVENTNUM'].idxmin()]
players_subbed_in_at_each_period = first_event_of_period[first_event_of_period['SUB'] == 'IN'][
['PLAYER_ID', 'PERIOD', 'SUB']]
periods = players_subbed_in_at_each_period['PERIOD'].drop_duplicates().values.tolist()
rows = []
for period in periods:
low = calculate_time_at_period(period) + 5
high = calculate_time_at_period(period + 1) - 5
boxscore = advanced_boxscore_url(game_id, low, high)
boxscore_players = extract_data(boxscore)[['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ID']]
boxscore_players['PERIOD'] = period
players_subbed_in_at_period = players_subbed_in_at_each_period[
players_subbed_in_at_each_period['PERIOD'] == period]
joined_players = pd.merge(boxscore_players, players_subbed_in_at_period, on=['PLAYER_ID', 'PERIOD'], how='left')
joined_players = joined_players[pd.isnull(joined_players['SUB'])][
['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ID', 'PERIOD']]
row = frame_to_row(joined_players)
row.append(period)
rows.append(row)
holder_players_on_court_at_start_of_period = pd.DataFrame(rows)
cols = ['TEAM_ID_1', 'TEAM_1_PLAYERS', 'TEAM_ID_2', 'TEAM_2_PLAYERS', 'PERIOD']
holder_players_on_court_at_start_of_period.columns = cols
players_on_court_at_start_of_period = players_on_court_at_start_of_period.concat([players_on_court_at_start_of_period, holder_players_on_court_at_start_of_period], axis=0).reset_index()[['TEAM_ID_1', 'TEAM_1_PLAYERS', 'TEAM_ID_2', 'TEAM_2_PLAYERS', 'PERIOD']]
players_on_court_at_start_of_period.to_csv("onoff0210.csv", index=False)
The full error message I am receiving is:
Traceback (most recent call last):
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\http\client.py", line 306, in begin
version, status, reason = self._read_status()
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\http\client.py", line 267, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\contrib\pyopenssl.py", line 318, in recv_into
raise SocketError(str(e))
OSError: (10060, 'WSAETIMEDOUT')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\util\retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\packages\six.py", line 734, in reraise
raise value.with_traceback(tb)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\http\client.py", line 306, in begin
version, status, reason = self._read_status()
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\http\client.py", line 267, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\urllib3\contrib\pyopenssl.py", line 318, in recv_into
raise SocketError(str(e))
urllib3.exceptions.ProtocolError: ('Connection aborted.', OSError("(10060, 'WSAETIMEDOUT')"))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/xxxxx/PycharmProjects/NBAdata/V.3/On Off V3.py", line 100, in <module>
boxscore_players = extract_data(boxscore)[['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ID']]
File "C:/Users/xxxxx/PycharmProjects/NBAdata/V.3/On Off V3.py", line 69, in extract_data
r = requests.get(url, headers=header_data)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\requests\sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "C:\Users\xxxxx\Anaconda3\envs\NBAdata\lib\site-packages\requests\adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', OSError("(10060, 'WSAETIMEDOUT')"))
I'm not the most experienced with this kind of code, so I came to stack overflow to try to find a solution, some involving create a User-Agent (which I have already done), switching advanced LAN settings in control panel (couldn't even find advanced LAN settings, might have been removed from Windows), trying to use an online IDE (but none of the ones I have found allow me to both import a csv and have my code output to a csv when done), trying to add a timeout to my requests.get (which really only caused more errors), and maybe even some others that I am forgetting at the moment. I have also made a bunch of other files with somewhat similar formats and similar targets urls that have worked fine. And yes, my internet connection is completely fine, everything else runs smoothly, including other Python files.
This has really caused a roadblock in my current project and I can't really continue anything until I resolve this, so if anybody can come in with a magical solution that would be fantastic. The expected result of this code is that it would iterate through the for loop all the way through the maximum parameter and output a csv that would then be used as a dataframe for all of the data I had just scraped.
As it turns out, the Key Error was being raised because the value began with '00' before the rest of the string, but the dataframe read that value as an integer and chopped off those two zeroes, creating a URL that went to nowhere. After changing the datatype, the program works fine now.