I am trying to scrape this web page with Scrapy, and I can get all the data I need except the distance. The link is https://www.thedogs.com.au/racing/albion-park/2024-05-30/10/tab-flying-amy-classic-h?trial=false
The distance is 520m. How do I get my spider to scrape this value? Please see the line wrapped in `***` (meant to be bold) in the code below.
# Extract links only from anchors inside td.meetings-venues__race-time cells,
# pass each fetched page to parse_item, and keep following links from those
# pages as well (follow=True).
rules = (
    Rule(
        LinkExtractor(
            restrict_xpaths="//td[@class='meetings-venues__race-time']/a",
        ),
        callback='parse_item',
        follow=True,
    ),
)
# Callback named by the crawl rule: yields one dict of result fields per
# runner row found on the race page.
def parse_item(self, response):
item = {}
hxs = Selector(response)
# Select every table row whose class is exactly "accordion__anchor race-runner".
divs = hxs.xpath('//tr[@class="accordion__anchor race-runner"]')
# titles = [hxs.select('//tr[@class="index class_tr group-6487"] | //tr[@class="index class_tr group-6488"] | //tr[@class="index class_tr group-6489"]')]
for div in divs:
# Every XPath below starts with "./" or ".//", so it is evaluated relative
# to the current row rather than the whole document.
item = {
'grade' : div.xpath(".//td[@class='race-runners__grade']/text()").extract(),
'greyhound' : div.xpath('./td[3]/div[1]/a/text()').extract(),
'position' : div.xpath('./td[1]/text()').extract(),
'trainer' : div.xpath(".//div[@class='race-runners__name__trainer']/a/text()").extract(),
'weight' : div.xpath(".//td[@class='race-runners__weight']/text()").extract(),
# extract_first() returns a single value; the extract() calls return lists.
'first_sec' : div.xpath(".//td[@class='race-runners__sectional']/text()").extract_first(),
'second_sec' : div.xpath(".//td[@class='race-runners__sectional'][2]/text()").extract(),
'time' : div.xpath(".//td[@class='race-runners__time']/text()").extract(),
'margin' : div.xpath(".//td[@class='race-runners__margin']/text()").extract(),
# The *** markers on the next line are the post's bold emphasis, not Python syntax.
# NOTE(review): this lookup is scoped to the current runner row; presumably the
# race-header element lives outside the row, which would leave this list empty
# - confirm against the page markup.
***'distance' : div.xpath(".//div[@class='race-header__info__grade']/a/text()").extract(),***
'starting_price' : div.xpath(".//td[@class='race-runners__starting-price']/text()").extract(),
# Date and track are taken from the '/'-separated pieces of the URL,
# third and fourth from the end respectively.
'date' : response.url.split('/')[-3],
'track' : response.url.split('/')[-4],
# name attribute of the <sprite-svg> element inside the box cell.
'rug' : div.xpath('.//td[@class="table__cell--tight race-runners__box"]/sprite-svg/@name').get()
#'rug' : div.xpath('//td[@class="table__cell--tight race-runners__box"]/sprite-svg/@name').extract()
}
yield item
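Grab the distance once, outside of the loop: it lives in the race header rather than in each runner row, so it has to be selected from `response`, not from the row. I'm using CSS, but you can equally use XPath. Note that the header text also contains the grade (e.g. 'OPEN 520m'); if you only want '520m', replace `.get()` with `.re_first(r"\d+m")`.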
from scrapy import Spider, Request
class MySpider(Spider):
    """Scrape one result record per runner from a single race page."""

    name = "thedogs"
    start_urls = ["https://www.thedogs.com.au/racing/albion-park/2024-05-30/10/tab-flying-amy-classic-h?trial=false"]

    def start_requests(self):
        """Yield one request per start URL, each handled by ``parse``."""
        for start_url in self.start_urls:
            yield Request(start_url, self.parse)

    async def parse(self, response):
        """Yield a dict of result fields for every runner row in the response."""
        # The header text is the same for every runner, so read it once up front.
        distance = response.css(".race-header__info__grade::text").get()

        # Date and track are the third- and fourth-from-last '/'-separated
        # pieces of the URL; split once instead of once per field.
        url_parts = response.url.split('/')
        race_date = url_parts[-3]
        track = url_parts[-4]

        runner_rows = response.xpath('//tr[@class="accordion__anchor race-runner"]')
        for row in runner_rows:
            # getall()/get() are the current names for extract()/extract_first();
            # list-valued fields stay lists and single-valued fields stay scalars.
            runner = {
                'distance': distance,
                'grade': row.xpath(".//td[@class='race-runners__grade']/text()").getall(),
                'greyhound': row.xpath('./td[3]/div[1]/a/text()').getall(),
                'position': row.xpath('./td[1]/text()').getall(),
                'trainer': row.xpath(".//div[@class='race-runners__name__trainer']/a/text()").getall(),
                'weight': row.xpath(".//td[@class='race-runners__weight']/text()").getall(),
                'first_sec': row.xpath(".//td[@class='race-runners__sectional']/text()").get(),
                'second_sec': row.xpath(".//td[@class='race-runners__sectional'][2]/text()").getall(),
                'time': row.xpath(".//td[@class='race-runners__time']/text()").getall(),
                'margin': row.xpath(".//td[@class='race-runners__margin']/text()").getall(),
                'starting_price': row.xpath(".//td[@class='race-runners__starting-price']/text()").getall(),
                'date': race_date,
                'track': track,
                'rug': row.xpath('.//td[@class="table__cell--tight race-runners__box"]/sprite-svg/@name').get(),
            }
            yield runner
Output:
{'distance': 'OPEN 520m', 'grade': ['5'], 'greyhound': ['Pronouns'], 'position': ['1st'], 'trainer': ['T: Tony Brett'], 'weight': ['26.60'], 'first_sec': '5.54', 'second_sec': ['17.09'], 'time': ['29.77'], 'margin': [], 'starting_price': ['$5.50'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_1'}
{'distance': 'OPEN 520m', 'grade': ['4'], 'greyhound': ['Cindy Keeping'], 'position': ['2nd'], 'trainer': ['T: Charmaine Roberts'], 'weight': ['28.90'], 'first_sec': '5.58', 'second_sec': ['17.27'], 'time': ['30.05'], 'margin': ['4.00'], 'starting_price': ['$8.50'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_8'}
{'distance': 'OPEN 520m', 'grade': ['5'], 'greyhound': ['Excavation'], 'position': ['3rd'], 'trainer': ['T: Jason Thompson'], 'weight': ['28.70'], 'first_sec': '5.56', 'second_sec': ['17.49'], 'time': ['30.21'], 'margin': ['6.25'], 'starting_price': ['$11.00'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_4'}
{'distance': 'OPEN 520m', 'grade': ['5'], 'greyhound': ['Which Trap'], 'position': ['4th'], 'trainer': ['T: John Dart'], 'weight': ['32.30'], 'first_sec': '5.67', 'second_sec': ['17.59'], 'time': ['30.32'], 'margin': ['7.75'], 'starting_price': ['$101.00'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_6'}
{'distance': 'OPEN 520m', 'grade': ['5'], 'greyhound': ['Mackenna'], 'position': ['5th'], 'trainer': ['T: Michelle Sultana'], 'weight': ['28.20'], 'first_sec': '5.54', 'second_sec': ['17.71'], 'time': ['30.52'], 'margin': ['10.50'], 'starting_price': ['$2.10'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_7'}
{'distance': 'OPEN 520m', 'grade': ['4'], 'greyhound': ['Super Scrub'], 'position': ['6th'], 'trainer': ['T: Travis Elson'], 'weight': ['32.80'], 'first_sec': '5.57', 'second_sec': ['17.31'], 'time': ['30.57'], 'margin': ['11.50'], 'starting_price': ['$8.00'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_2'}
{'distance': 'OPEN 520m', 'grade': ['4'], 'greyhound': ["History's Coming"], 'position': ['7th'], 'trainer': ['T: Tomas Rees'], 'weight': ['33.60'], 'first_sec': '5.58', 'second_sec': ['17.43'], 'time': ['30.61'], 'margin': ['12.00'], 'starting_price': ['$5.00'], 'date': '2024-05-30', 'track': 'albion-park', 'rug': 'rug_3'}