Here is the script:
from bs4 import BeautifulSoup as bs4
import requests
import json
from lxml import html
from pprint import pprint
import re
def get_data():
    """Fetch the Bovada MLB game-lines page and return its embedded market JSON.

    The page is expected to assign a JSON object to ``swc_market_lists``
    inside a ``<script>`` tag. The returned string is that script's text
    starting at the opening brace of the object, ready for ``json.loads``.

    Returns:
        str: the script text from the first ``{`` of the assignment onward.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        ValueError: if no script containing the assignment can be located.
    """
    url = 'https://sports.bovada.lv//baseball/mlb/game-lines-market-group'
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36"}
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=30)
    # Fail here, with a clear HTTP error, instead of later while parsing an
    # error page that does not contain the market data.
    r.raise_for_status()

    soup = bs4(r.text, 'lxml')
    pattern = re.compile(r"swc_market_lists\s+=\s+(\{.*?\})")
    script = soup.find("script", text=pattern)
    if script is None:
        raise ValueError("no <script> assigning swc_market_lists found in page")

    script_text = script.text
    match = pattern.search(script_text)
    if match is None:
        raise ValueError("swc_market_lists assignment not found in script text")
    # Slice from where the object literal actually starts rather than from a
    # hard-coded character offset, so a change in the prefix before the
    # brace does not corrupt the JSON.
    return script_text[match.start(1):]
# Download the embedded market JSON and decode it into Python objects.
test1 = get_data()
data = json.loads(test1)

# data['items'] is a list of market groups; each group's
# ['itemList']['items'] is the list of individual games.
for item1 in data['items']:
    data1 = item1['itemList']['items']
    for item2 in data1:
        # Probable starting pitchers as listed on the game entry.
        pitch_a = item2['opponentAName']
        pitch_b = item2['opponentBName']
        ## group = item2['displayGroups']
        ## for item3 in group:
        ##     new_il = item3['itemList']
        ##     for item4 in new_il:
        ##         market = item4['description']
        ##         oc = item4['outcomes']
        # Print one line per game.
        print(pitch_a, pitch_b)
##for items in data['items']:
## pos = items['itemList']['items']
## for item in pos:
## work = item['competitors']
## pitcher_a = item['opponentAName']
## pitcher_b = item['opponentBName']
## group = item['displayGroups']
## for item, item2 in zip(work,group):
## team = item['abbreviation']
## place = item['type']
## il2 = item2['itemList']
## for item in il2:
## ml = item['description']
## print(team,place,pitcher_a,pitcher_b,ml)
I have been trying to scrape
team abbrev
= ['items']['itemList']['items']['competitors']['abbreviation']
home_away
= ['items']['itemList']['items']['competitors']['type']
team pitcher home
= ['items']['itemList']['items']['opponentAName']
team pitcher away
= ['items']['itemList']['items']['opponentBName']
moneyline american odds
= ['items']['itemList']['items']['displayGroups']['itemList']['outcomes']['price']['american']
Total runs
= ['items']['itemList']['items']['displayGroups']['itemList']['outcomes']['price']['handicap']
Part of the JSON, pretty-printed with pprint:
[{'baseLink': '/baseball/mlb/game-lines-market-group',
'defaultType': True,
'description': 'Game Lines',
'id': '136',
'itemList': {'items': [{'LIVE': True,
'atmosphereLink': '/api/atmosphere/eventNotification/events/A/3149961',
'awayTeamFirst': True,
'baseLink': '/baseball/mlb/minnesota-twins-los-angeles-angels-201805112207',
'competitionId': '24736',
'competitors': [{'abbreviation': 'LAA',
'description': 'Los Angeles Angels',
'id': '3149961-1642',
'rotationNumber': '978',
'shortName': 'Angels',
'type': 'HOME'},
{'abbreviation': 'MIN',
'description': 'Minnesota Twins',
'id': '3149961-9990',
'rotationNumber': '977',
'shortName': 'Twins',
'type': 'AWAY'}],
'denySameGame': 'NO',
'description': 'Minnesota Twins @ Los Angeles Angels',
'displayGroups': [{'baseLink': '/baseball/mlb/game-lines-market-group',
'defaultType': True,
'description': 'Game Lines',
'id': '136',
'itemList': [{'belongsToDefault': True,
'columns': 'H2Columns',
'description': 'Moneyline',
'displayGroups': '136,A-136',
'id': '46892277',
'isInRunning': True,
'mainMarketType': 'MONEYLINE',
'mainPeriod': True,
'marketTypeGroup': 'MONEY_LINE',
'notes': '',
'outcomes': [{'competitorId': '3149961-9990',
'description': 'Minnesota '
'Twins',
'id': '211933276',
'price': {'american': '-475',
'decimal': '1.210526',
'fractional': '4/19',
'id': '1033002124',
'outcomeId': '211933276'},
'status': 'OPEN',
'type': 'A'},
{'competitorId': '3149961-1642',
'description': 'Los '
'Angeles '
'Angels',
'id': '211933277',
'price': {'american': '+310',
'decimal': '4.100',
'fractional': '31/10',
'id': '1033005679',
'outcomeId': '211933277'},
'status': 'OPEN',
'type': 'H'}],
'periodType': 'Live '
'Match',
'sequence': '14',
'sportCode': 'BASE',
'status': 'OPEN',
'type': 'WW'},
{'belongsToDefault': True,
'columns': 'H2Columns',
'description': 'Runline',
'displayGroups': '136,A-136',
'id': '46892287',
'isInRunning': True,
'mainMarketType': 'SPREAD',
'mainPeriod': True,
'marketTypeGroup': 'SPREAD',
'notes': '',
'outcomes': [{'competitorId': '3149961-9990',
'description': 'Minnesota '
'Twins',
'id': '211933278',
'price': {'american': '+800',
'decimal': '9.00',
'fractional': '8/1',
'handicap': '-1.5',
'id': '1033005677',
'outcomeId': '211933278'},
'status': 'OPEN',
'type': 'A'},
{'competitorId': '3149961-1642',
'description': 'Los '
'Angeles '
'Angels',
'id': '211933279',
'price': {'american': '-2000',
'decimal': '1.050',
'fractional': '1/20',
'handicap': '1.5',
'id': '1033005678',
'outcomeId': '211933279'},
'status': 'OPEN',
'type': 'H'}],
'periodType': 'Live '
'Match',
'sequence': '14',
'sportCode': 'BASE',
'status': 'OPEN',
'type': 'SPR'}],
'link': '/baseball/mlb/game-lines-market-group'}],
'feedCode': '13625145',
'id': '3149961',
'link': '/baseball/mlb/minnesota-twins-los-angeles-angels-201805112207',
'notes': '',
'numMarkets': 2,
'opponentAId': '214704',
'opponentAName': 'Tyler Skaggs (L)',
'opponentBId': '215550',
'opponentBName': 'Lance Lynn (R)',
'sport': 'BASE',
'startTime': 1526090820000,
'status': 'O',
'type': 'MLB'},
There are a few different loops I had started in the script above, but neither of them is working out the way I would like.
away team | away moneyline | away pitcher | Total Runs | and repeat for Home Team is what I would like it to be eventually. I can write to csv
once it is parsed the proper way.
Thank you for the fresh set of eyes, I've been working on this for the better part of a day trying to figure out the best way to access the content I would like. If Json
is not the best way and bs4
works better I would love to hear your opinion
There's no simple answer to your problem. Scraping data requires you to carefully assess the data you are dealing with, work out where the parts you want to extract are located and figure out how to effectively store the data you extract.
Try printing the data in your loops to visualise what is happening in your code (or try debugging). From there it's easy to figure out whether you're iterating over what you expect. Look for patterns throughout the input data to help organise the data you extract.
To help yourself, you should give your variables descriptive names, separate your code into logical chunks and add comments when it starts to get complicated.
Here's some working code, but I encourage you to try what I told you above, then if you're still stuck look below for guidance.
# Build `output`: one entry per game, keyed by the game's description
# (e.g. 'Away Team @ Home Team'), holding a dict for 'HOME' and 'AWAY'.

# Maps a market's 'mainMarketType' to (key stored in the output,
# field read from outcome['price']).
# NOTE(review): in the sample data the SPREAD market is described as
# 'Runline', so its handicap looks like the run line rather than the game
# total; the 'total runs' key is kept so the output shape is unchanged.
MARKET_FIELDS = {
    'MONEYLINE': ('moneyline', 'american'),
    'SPREAD': ('total runs', 'handicap'),
}

output = {}
root = data['items'][0]
for game_line in root['itemList']['items']:
    # Start each side's record with the team identity.
    # team['type'] is either 'HOME' or 'AWAY'.
    team_data = {}
    for team in game_line['competitors']:
        team_data[team['type']] = {
            'abbreviation': team['abbreviation'],
            'name': team['description'],
        }

    # A game with no display groups has no markets to read; skip the
    # market pass for it instead of failing on displayGroups[0].
    display_groups = game_line['displayGroups']
    markets = display_groups[0]['itemList'] if display_groups else []
    for item in markets:
        # The market type is the same for every outcome of a market, so
        # resolve it once per market rather than once per outcome.
        fields = MARKET_FIELDS.get(item.get('mainMarketType'))
        if fields is None:
            continue
        output_key, price_field = fields
        for outcome in item['outcomes']:
            # outcome['type'] is 'H' for the home side, 'A' for away.
            team_type = 'HOME' if outcome['type'] == 'H' else 'AWAY'
            team_data[team_type][output_key] = outcome['price'][price_field]

    # Get the pitchers
    team_data['HOME']['pitcher'] = game_line['opponentAName']
    team_data['AWAY']['pitcher'] = game_line['opponentBName']

    # For each gameline, add the team data we gathered to the output dict
    output[game_line['description']] = team_data
This produces output like:
{
'Atlanta Braves @ Miami Marlins': {
'AWAY': {
'abbreviation': 'ATL',
'moneyline': '-130',
'name': 'Atlanta Braves',
'pitcher': 'Mike Soroka (R)',
'total runs': '-1.5'
},
'HOME': {
'abbreviation': 'MIA',
'moneyline': '+110',
'name': 'Miami Marlins',
'pitcher': 'Jarlin Garcia (L)',
'total runs': '1.5'
}
},
'Boston Red Sox @ Toronto Blue Jays': {
'AWAY': {
'abbreviation': 'BOS',
'moneyline': '-133',
'name': 'Boston Red Sox',
'pitcher': 'David Price (L)',
'total runs': '-1.5'
},
'HOME': {
'abbreviation': 'TOR',
'moneyline': '+113',
'name': 'Toronto Blue Jays',
'pitcher': 'Marco Estrada (R)',
'total runs': '1.5'
}
},
}