I'm trying to scrape a fairly straightforward website with a Scrapy BaseSpider since I know in advance where all of the links that I want to crawl are.
The basic layout of the site to be crawled is
I can successfully navigate and get data at all 4 levels, however, my county field is not being populated correctly. For a given agency, instead of the actual county it is in, I get the last county in the State the agency is located in.
Example:
Can't seem to figure out something that I think is relatively simple.
Here's the code:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from agencyspider.items import AgencyItem
from scrapy.http import Request
class BasicspiderSpider(BaseSpider):
name = "basicSpider"
allowed_domains = ["usacops.com"]
start_urls = [
'http://www.usacops.com/',
]
items = {}
def parse(self, response):
sel = Selector(response)
states = sel.xpath('//comment()[.=" Begin State Names "]/following::table[1]/tr/td/a')
for s in states:
item = AgencyItem()
state = s.xpath('text()').extract()[0]
url = s.xpath('@href').extract()[0]
item['state'] = state
item['stateUrl']= url
yield Request(url=url,callback=self.parse_counties,meta={'item':item})
def parse_counties(self, response):
sel = Selector(response)
counties = sel.xpath('//comment()[.=" Begin Counties "]/following::table[1]/tr/td/font/a | //comment()[.=" Begin Counties "]/following::table[1]/tr/td/a')
for c in counties:
item = response.request.meta["item"]
county = c.xpath('text()').extract()[0]
countyUrl = c.xpath('@href').extract()[0]
url = item["stateUrl"] + countyUrl
item["county"]=county
item["countyUrl"]=url
yield Request(url=url, callback=self.parse_agencies,meta={'item':item})
def parse_agencies(self,response):
sel = Selector(response)
agencies = sel.xpath('//table[9]/tr/td/table[2]/tr/td/font/a | //table[9]/tr/td/table[2]/tr/td/a')
for a in agencies:
item = response.request.meta["item"]
agency = a.xpath('text()').extract()[0]
agencyUrl = a.xpath('@href').extract()[0]
url = item["stateUrl"] + agencyUrl
item["agency"] = agency
item["agencyUrl"] = url
yield Request(url=url, callback=self.parse_agencyinfo,meta={'item':item})
def parse_agencyinfo(self,response):
sel = Selector(response)
item = response.request.meta["item"]
item["agency"]= ' '.join(sel.xpath('//comment()[.=" Begin center section "]/following::table/tr/td/strong/font[1]/text()').extract())
item["admintype"]= ' '.join(sel.xpath('//comment()[.=" Begin center section "]/following::table/tr/td/strong/font[2]/text()').extract())
item["adminhead"]= ' '.join(sel.xpath('//comment()[.=" Begin center section "]/following::table/tr/td/strong/font[3]/text()[1]').extract())
item["address"]= ' '.join(sel.xpath('//comment()[.=" Begin center section "]/following::table/tr/td/strong/font[3]/text()[position()>1]').extract())
return item
Hey so the problem is every time you assign item = response.request.meta["item"]
your referencing and assigning the same item over and over again.
Fortunately its an easy fix! Just wrap response.request.meta["item"]
with AgencyItem(response.request.meta["item"])
to create a copy of the state item for each county.
Also don't forget to do the same in other callbacks or else you'll have the problem with other fields. Hope that helps!