Search code examples
pythonjsonweb-scrapingbeautifulsoup

Scraping JSON with Python 3


Here is the script:

from bs4 import BeautifulSoup as bs4
import requests
import json
from lxml import html
from pprint import pprint

import re


def get_data():
    """Download the Bovada MLB game-lines page and return its embedded JSON text.

    The page is searched for a ``<script>`` tag whose content assigns an
    object literal to ``swc_market_lists``. Everything from the opening
    ``{`` of that assignment to the end of the script text is returned,
    undecoded.

    Returns:
        str: The JSON text, intended to be passed to ``json.loads``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If no script containing the assignment is found.
    """
    url = 'https://sports.bovada.lv//baseball/mlb/game-lines-market-group'
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36"}

    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    r = requests.get(url, headers=headers, timeout=30)
    # Fail here on 4xx/5xx rather than later while parsing an error page.
    r.raise_for_status()
    soup = bs4(r.text, 'lxml')

    pattern = re.compile(r"swc_market_lists\s+=\s+(\{.*?\})")
    script = soup.find("script", string=pattern)

    script_text = script.text if script is not None else ""
    match = pattern.search(script_text)
    if match is None:
        raise ValueError("no <script> assigning swc_market_lists was found at " + url)

    # Start at the captured '{' instead of a fixed offset, so the result does
    # not depend on the exact length of whatever precedes the object literal.
    return script_text[match.start(1):]

# Fetch the JSON text embedded in the page, then decode it.
test1 = get_data()
data = json.loads(test1)

# Print the two pitcher names listed for every game of every market group.
for market_group in data['items']:
    for game in market_group['itemList']['items']:
        # Unfinished draft for walking each game's markets, kept for reference:
##            group = game['displayGroups']
##            for item3 in group:
##                new_il = item3['itemList']
##                for item4 in new_il:
##                    market = item4['description']
##                    oc = item4['outcomes']
        print(game['opponentAName'], game['opponentBName'])


##for items in data['items']:
##    pos = items['itemList']['items']
##    for item in pos:
##        work = item['competitors']
##        pitcher_a = item['opponentAName']
##        pitcher_b = item['opponentBName']
##        group = item['displayGroups']
##        for item, item2 in zip(work,group):
##            team = item['abbreviation']
##            place = item['type']
##            il2 = item2['itemList']
##            for item in il2:
##                ml = item['description']

##            print(team,place,pitcher_a,pitcher_b,ml)

I have been trying to scrape the following fields:

team abbrev = ['items']['itemList']['items']['competitors']['abbreviation']

home_away = ['items']['itemList']['items']['competitors']['type']

team pitcher home = ['items']['itemList']['items']['opponentAName']

team pitcher away = ['items']['itemList']['items']['opponentBName']

moneyline american odds = ['items']['itemList']['items']['displayGroups']['itemList']['outcomes']['price']['american']

Total runs = ['items']['itemList']['items']['displayGroups']['itemList']['outcomes']['price']['handicap']

Part of the JSON, pretty-printed with pprint:

[{'baseLink': '/baseball/mlb/game-lines-market-group',
  'defaultType': True,
  'description': 'Game Lines',
  'id': '136',
  'itemList': {'items': [{'LIVE': True,
                          'atmosphereLink': '/api/atmosphere/eventNotification/events/A/3149961',
                          'awayTeamFirst': True,
                          'baseLink': '/baseball/mlb/minnesota-twins-los-angeles-angels-201805112207',
                          'competitionId': '24736',
                          'competitors': [{'abbreviation': 'LAA',
                                           'description': 'Los Angeles Angels',
                                           'id': '3149961-1642',
                                           'rotationNumber': '978',
                                           'shortName': 'Angels',
                                           'type': 'HOME'},
                                          {'abbreviation': 'MIN',
                                           'description': 'Minnesota Twins',
                                           'id': '3149961-9990',
                                           'rotationNumber': '977',
                                           'shortName': 'Twins',
                                           'type': 'AWAY'}],
                          'denySameGame': 'NO',
                          'description': 'Minnesota Twins @ Los Angeles Angels',
                          'displayGroups': [{'baseLink': '/baseball/mlb/game-lines-market-group',
                                             'defaultType': True,
                                             'description': 'Game Lines',
                                             'id': '136',
                                             'itemList': [{'belongsToDefault': True,
                                                           'columns': 'H2Columns',
                                                           'description': 'Moneyline',
                                                           'displayGroups': '136,A-136',
                                                           'id': '46892277',
                                                           'isInRunning': True,
                                                           'mainMarketType': 'MONEYLINE',
                                                           'mainPeriod': True,
                                                           'marketTypeGroup': 'MONEY_LINE',
                                                           'notes': '',
                                                           'outcomes': [{'competitorId': '3149961-9990',
                                                                         'description': 'Minnesota '
                                                                                        'Twins',
                                                                         'id': '211933276',
                                                                         'price': {'american': '-475',
                                                                                   'decimal': '1.210526',
                                                                                   'fractional': '4/19',
                                                                                   'id': '1033002124',
                                                                                   'outcomeId': '211933276'},
                                                                         'status': 'OPEN',
                                                                         'type': 'A'},
                                                                        {'competitorId': '3149961-1642',
                                                                         'description': 'Los '
                                                                                        'Angeles '
                                                                                        'Angels',
                                                                         'id': '211933277',
                                                                         'price': {'american': '+310',
                                                                                   'decimal': '4.100',
                                                                                   'fractional': '31/10',
                                                                                   'id': '1033005679',
                                                                                   'outcomeId': '211933277'},
                                                                         'status': 'OPEN',
                                                                         'type': 'H'}],
                                                           'periodType': 'Live '
                                                                         'Match',
                                                           'sequence': '14',
                                                           'sportCode': 'BASE',
                                                           'status': 'OPEN',
                                                           'type': 'WW'},
                                                          {'belongsToDefault': True,
                                                           'columns': 'H2Columns',
                                                           'description': 'Runline',
                                                           'displayGroups': '136,A-136',
                                                           'id': '46892287',
                                                           'isInRunning': True,
                                                           'mainMarketType': 'SPREAD',
                                                           'mainPeriod': True,
                                                           'marketTypeGroup': 'SPREAD',
                                                           'notes': '',
                                                           'outcomes': [{'competitorId': '3149961-9990',
                                                                         'description': 'Minnesota '
                                                                                        'Twins',
                                                                         'id': '211933278',
                                                                         'price': {'american': '+800',
                                                                                   'decimal': '9.00',
                                                                                   'fractional': '8/1',
                                                                                   'handicap': '-1.5',
                                                                                   'id': '1033005677',
                                                                                   'outcomeId': '211933278'},
                                                                         'status': 'OPEN',
                                                                         'type': 'A'},
                                                                        {'competitorId': '3149961-1642',
                                                                         'description': 'Los '
                                                                                        'Angeles '
                                                                                        'Angels',
                                                                         'id': '211933279',
                                                                         'price': {'american': '-2000',
                                                                                   'decimal': '1.050',
                                                                                   'fractional': '1/20',
                                                                                   'handicap': '1.5',
                                                                                   'id': '1033005678',
                                                                                   'outcomeId': '211933279'},
                                                                         'status': 'OPEN',
                                                                         'type': 'H'}],
                                                           'periodType': 'Live '
                                                                         'Match',
                                                           'sequence': '14',
                                                           'sportCode': 'BASE',
                                                           'status': 'OPEN',
                                                           'type': 'SPR'}],
                                             'link': '/baseball/mlb/game-lines-market-group'}],
                          'feedCode': '13625145',
                          'id': '3149961',
                          'link': '/baseball/mlb/minnesota-twins-los-angeles-angels-201805112207',
                          'notes': '',
                          'numMarkets': 2,
                          'opponentAId': '214704',
                          'opponentAName': 'Tyler Skaggs (L)',
                          'opponentBId': '215550',
                          'opponentBName': 'Lance Lynn (R)',
                          'sport': 'BASE',
                          'startTime': 1526090820000,
                          'status': 'O',
                          'type': 'MLB'},

There are a few different loops I had started in the script above, but neither of them is working out the way I would like.

away team | away moneyline | away pitcher | Total Runs | and repeat for Home Team is what I would like it to be eventually. I can write to csv once it is parsed the proper way.

Thank you for the fresh set of eyes; I've been working on this for the better part of a day, trying to figure out the best way to access the content I want. If JSON is not the best way and bs4 works better, I would love to hear your opinion.


Solution

  • There's no simple answer to your problem. Scraping data requires you to carefully assess the data you are dealing with, work out where the parts you want to extract are located and figure out how to effectively store the data you extract.

    Try printing the data in your loops to visualise what is happening in your code (or try debugging). From there it's easy to figure out whether you're iterating over what you expect. Look for patterns throughout the input data to help organise the data you extract.

    To help yourself, you should give your variables descriptive names, separate your code into logical chunks and add comments when it starts to get complicated.

    Here's some working code, but I encourage you to try what I told you above, then if you're still stuck look below for guidance.

    # Build `output`: one entry per game, keyed by the game's description,
    # holding a dict of per-team data under 'HOME' and 'AWAY'.
    output = {}
    # Only the first entry of data['items'] is read.
    root = data['items'][0]
    # Outcome 'type' codes mapped to the competitor 'type' values used as keys.
    # Any other code is skipped rather than being counted as the away side.
    side_by_outcome_type = {'H': 'HOME', 'A': 'AWAY'}

    for game_line in root['itemList']['items']:
        # Temporary dict collecting the data for this game line
        team_data = {}

        # Get competitors: one sub-dict per team, keyed by 'HOME' or 'AWAY'
        for team in game_line['competitors']:
            team_data[team['type']] = {
                'abbreviation': team['abbreviation'],
                'name': team['description'],
            }

        # Get the moneyline and spread figures from the first display group.
        # A game with no display groups contributes no market data instead of
        # raising IndexError.
        display_groups = game_line['displayGroups']
        markets = display_groups[0]['itemList'] if display_groups else []
        for item in markets:
            market_type = item.get('mainMarketType')
            for outcome in item['outcomes']:
                team_type = side_by_outcome_type.get(outcome['type'])
                if team_type is None:
                    continue
                # setdefault covers an outcome whose side was not in 'competitors'
                side = team_data.setdefault(team_type, {})
                if market_type == 'MONEYLINE':
                    side['moneyline'] = outcome['price']['american']
                elif market_type == 'SPREAD':
                    # NOTE(review): this stores the SPREAD market's handicap
                    # under 'total runs'; confirm that is the intended figure.
                    side['total runs'] = outcome['price']['handicap']

        # Get the pitchers: opponent A is stored with HOME, opponent B with AWAY
        team_data.setdefault('HOME', {})['pitcher'] = game_line['opponentAName']
        team_data.setdefault('AWAY', {})['pitcher'] = game_line['opponentBName']

        # For each game line, add the team data we gathered to the output dict
        output[game_line['description']] = team_data
    

    This produces output like:

    {
      'Atlanta Braves @ Miami Marlins': {
        'AWAY': {
          'abbreviation': 'ATL',
          'moneyline': '-130',
          'name': 'Atlanta Braves',
          'pitcher': 'Mike Soroka (R)',
          'total runs': '-1.5'
        },
        'HOME': {
          'abbreviation': 'MIA',
          'moneyline': '+110',
          'name': 'Miami Marlins',
          'pitcher': 'Jarlin Garcia (L)',
          'total runs': '1.5'
        }
      },
      'Boston Red Sox @ Toronto Blue Jays': {
        'AWAY': {
          'abbreviation': 'BOS',
          'moneyline': '-133',
          'name': 'Boston Red Sox',
          'pitcher': 'David Price (L)',
          'total runs': '-1.5'
        },
        'HOME': {
          'abbreviation': 'TOR',
          'moneyline': '+113',
          'name': 'Toronto Blue Jays',
          'pitcher': 'Marco Estrada (R)',
          'total runs': '1.5'
        }
      },
    
    }