Search code examples
python, scrapy

Scrapy Collecting Data From Table


I don't get errors from the script below, but it returns no data. I am trying to get all the games for each of the weeks, which start in table 4 of the HTML. When I enter the XPath commands in the Scrapy shell I get data, but once I put them in the parse definition I don't get anything in return.

import scrapy


class NFLOddsSpider(scrapy.Spider):
    """Spider intended to collect per-game NFL odds for the 2022 season
    from sportsoddshistory.com.

    NOTE(review): as posted, this spider yields items with missing/wrong
    data -- see the comments inside parse() for the likely causes.
    """
    name = 'NFLOdds'
    allowed_domains = ['www.sportsoddshistory.com']
    start_urls = ['https://www.sportsoddshistory.com/nfl-game-season/?y=2022']

    def parse(self, response):
        """Yield one dict per <tr> found in tables of class "soh1".

        NOTE(review): this selector iterates rows of EVERY table with
        class "soh1" on the page, not only the game tables -- presumably
        why non-game rows produce empty fields; verify against the live
        page markup.
        """
        for row in response.xpath('//table[@class="soh1"]//tbody/tr'):

            # Positional cells; extract_first() returns None for a
            # missing cell rather than raising.
            day = row.xpath('td[1]//text()').extract_first()
            date = row.xpath('td[2]//text()').extract_first()
            time = row.xpath('td[3]//text()').extract_first()
            AtFav = row.xpath('td[4]//text()').extract_first()
            favorite = row.xpath('td[5]//text()').extract_first()
            score = row.xpath('td[6]//text()').extract_first()
            spread = row.xpath('td[7]//text()').extract_first()
            AtDog = row.xpath('td[8]//text()').extract_first()
            underdog = row.xpath('td[9]//text()').extract_first()
            OvUn = row.xpath('td[10]//text()').extract_first()
            notes = row.xpath('td[11]//text()').extract_first()
            # NOTE(review): this XPath is absolute ('//*...'), so it
            # ignores the current row's context and returns the same
            # first matching <h3> for every item yielded.
            week = row.xpath('//*[@id="content"]/div/table[4]/tbody/tr/td/h3').extract_first()

            # One item per game row.
            oddsTable = {
                'day': day,
                'date': date,
                'time': time,
                'AtFav': AtFav,
                'favorite': favorite,
                'score': score,
                'spread': spread,
                'AtDog': AtDog,
                'underdog': underdog,
                'OvUn': OvUn,
                'notes': notes,
                'week' : week
            }
            yield oddsTable

Solution

  • Updating answer to include Playoffs table; code below

    scrapy runspider NFLOddsSpider.py -O output.csv to execute

    import re
    import scrapy
    
    class NFLOddsSpider(scrapy.Spider):
        """Scrape per-game NFL odds (regular season and playoffs) for the
        2022 season from sportsoddshistory.com.

        Each yielded item is a flat dict; regular-season rows carry a
        'week' and 'notes', playoff rows carry a 'round' instead.
        Run with:  scrapy runspider NFLOddsSpider.py -O output.csv
        """
        name = 'NFLOdds'
        allowed_domains = ['www.sportsoddshistory.com']
        start_urls = ['https://www.sportsoddshistory.com/nfl-game-season/?y=2022']

        # Headings that identify the two table layouts on the page.
        _WEEK_RE = re.compile(r'(\d{4}) Regular Season - Week (\d{1,2})')
        _PLAYOFF_RE = re.compile(r'(\d{4}) Playoffs')

        @staticmethod
        def _row_fields(row, off):
            """Extract the cells shared by both layouts, starting at td[off].

            Playoff rows have an extra leading "round" cell, so their
            shared cells start one column later (off=2 vs off=1).
            A bold link inside the favorite/underdog cell marks the team
            that covered the spread.
            """
            def cell(i):
                return row.xpath(f'td[{i}]//text()').get()

            return {
                'day': cell(off),
                'date': cell(off + 1),
                'time_et': cell(off + 2),
                'fav_at': cell(off + 3),
                'favorite': cell(off + 4),
                'fav_cover_spread': bool(row.xpath(f'td[{off + 4}]//a/b/text()').get()),
                'score': cell(off + 5),
                'spread': cell(off + 6),
                'und_at': cell(off + 7),
                'underdog': cell(off + 8),
                'und_cover_spread': bool(row.xpath(f'td[{off + 8}]//a/b/text()').get()),
                'over_under': cell(off + 9),
            }

        def parse(self, response):
            """Yield one dict per game row.

            Walks every <h3> heading; a heading matching one of the two
            patterns identifies the game table that immediately follows it.
            """
            for h3 in response.xpath('//h3'):
                # An <h3> may have no direct text node, in which case
                # get() returns None -- guard before stripping.
                heading = (h3.xpath('./text()').get() or '').strip()

                playoffs = False
                if m := self._WEEK_RE.match(heading):
                    year, week = m.group(1), m.group(2)
                elif m := self._PLAYOFF_RE.match(heading):
                    playoffs = True
                    year, week = m.group(1), None
                else:
                    continue

                # First table *sibling* after this heading. (Plain
                # following-sibling from the h3 itself; './/' would also
                # take siblings of the h3's descendants and can match
                # unrelated tables.)
                table = h3.xpath('following-sibling::table[1]')
                for row in table.xpath('./tbody[1]/tr'):
                    if playoffs:
                        yield {
                            'year': year,
                            'week': week,
                            'round': row.xpath('td[1]//text()').get(),
                            **self._row_fields(row, 2),
                            'notes': None,
                        }
                    else:
                        yield {
                            'year': year,
                            'week': week,
                            'round': None,
                            **self._row_fields(row, 1),
                            'notes': row.xpath('td[11]//text()').get(),
                        }
    
    

CSV data screenshot: (see attached image)