When I run the code below, I end up with a file that has all the expected data from the second code block but nothing from the first. In other words, all of the data from EventLocation to EventURL is present but nothing from EventArtist to EventDetails. What do I need to modify to get this working correctly?
import urlparse
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
#from NT.items import NowTorontoItem
from scrapy.item import Item, Field
class NowTorontoItem(Item):
eventArtist = Field()
eventTitle = Field()
eventHolder = Field()
eventDetails = Field()
eventLocation = Field()
eventOrganization = Field()
eventName = Field()
eventAddress = Field()
eventLocality = Field()
eventPostalCode = Field()
eventPhone = Field()
eventURL = Field()
class MySpider(BaseSpider):
name = "NTSpider"
allowed_domains = ["nowtoronto.com"]
start_urls = ["http://www.nowtoronto.com/music/listings/"]
def parse(self, response):
selector = Selector(response)
listings = selector.css("div.listing-item0, div.listing-item1")
for listing in listings:
item = NowTorontoItem()
for body in listing.css('span.listing-body > div.List-Body'):
item ["eventArtist"] = body.css("span.List-Name::text").extract()
item ["eventTitle"] = body.css("span.List-Body-Emphasis::text").extract()
item ["eventHolder"] = body.css("span.List-Body-Strong::text").extract()
item ["eventDetails"] = body.css("::text").extract()
# yield a Request()
# so that scrapy enqueues a new page to fetch
detail_url = listing.css("div.listing-readmore > a::attr(href)")
if detail_url:
yield Request(urlparse.urljoin(response.url,
detail_url.extract()[0]),
callback=self.parse_details)
def parse_details(self, response):
self.log("parse_details: %r" % response.url)
selector = Selector(response)
listings = selector.css("div.whenwhereContent")
for listing in listings:
for body in listing.css('td.small-txt.dkgrey-txt.rightInfoTD'):
item = NowTorontoItem()
item ["eventLocation"] = body.css("span[property='v:location']::text").extract()
item ["eventOrganization"] = body.css("span[property='v:organization'] span[property='v:name']::text").extract()
item ["eventName"] = body.css("span[property='v:name']::text").extract()
item ["eventAddress"] = body.css("span[property='v:street-address']::text").extract()
item ["eventLocality"] = body.css("span[property='v:locality']::text").extract()
item ["eventPostalCode"] = body.css("span[property='v:postal-code']::text").extract()
item ["eventPhone"] = body.css("span[property='v:tel']::text").extract()
item ["eventURL"] = body.css("span[property='v:url']::text").extract()
yield item
Edit
It now appears to be running but there is a small problem. For each event, it returns either two rows, one with all of the details and one with only the details pulled from the first code block or three rows, one with all of the details and two identical rows with only the details pulled from the first block.
Here is an example of the first situation
2014-03-21 11:12:40-0400 [NTSpider] DEBUG: parse_details: 'http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationId=0'
2014-03-21 11:12:40-0400 [NTSpider] DEBUG: Scraped from <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationId=0>
{'eventAddress': [u'875 Bloor W'],
'eventArtist': [u'Andria Simone & Those Guys'],
'eventDetails': [u'Andria Simone & Those Guys',
u' (pop/soul) ',
u'Baltic Avenue',
u' 8 pm, $15.'],
'eventHolder': [u'Baltic Avenue'],
'eventLocality': [u'Toronto'],
'eventLocation': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t'],
'eventName': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tBaltic Avenue'],
'eventOrganization': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tBaltic Avenue'],
'eventPhone': [u'647-898-5324'],
'eventPostalCode': [u'M6G 3T6'],
'eventTitle': [],
'eventURL': []}
2014-03-21 11:12:40-0400 [NTSpider] DEBUG: Scraped from <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationId=0>
{'eventAddress': [],
'eventArtist': [u'Andria Simone & Those Guys'],
'eventDetails': [u'Andria Simone & Those Guys',
u' (pop/soul) ',
u'Baltic Avenue',
u' 8 pm, $15.'],
'eventHolder': [u'Baltic Avenue'],
'eventLocality': [],
'eventLocation': [],
'eventName': [],
'eventOrganization': [],
'eventPhone': [],
'eventPostalCode': [],
'eventTitle': [],
'eventURL': []}
And here is an example of the second situation
2014-03-21 11:21:23-0400 [NTSpider] DEBUG: parse_details: 'http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationId=0'
2014-03-21 11:21:23-0400 [NTSpider] DEBUG: Scraped from <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationId=0>
{'eventAddress': [u'11 Polson'],
'eventArtist': [u'Danny Byrd, S.P.Y., Fred V & Grafix, Marcus Visionary, Lushy '],
'eventDetails': [u'Danny Byrd, S.P.Y., Fred V & Grafix, Marcus Visionary, Lushy ',
u'Bassweek: Projek-Hospitality ',
u'Sound Academy',
u' $35 or wristband TM.'],
'eventHolder': [u'Sound Academy'],
'eventLocality': [u'Toronto'],
'eventLocation': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t'],
'eventName': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tSound Academy'],
'eventOrganization': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tSound Academy'],
'eventPhone': [u'416-461-3625'],
'eventPostalCode': [u'M5A 1A4'],
'eventTitle': [u'Bassweek: Projek-Hospitality '],
'eventURL': [u'sound-academy.com']}
2014-03-21 11:21:23-0400 [NTSpider] DEBUG: Crawled (200) <GET http://www.nowtoronto.com/music/listings/listing.cfm?listingid=122291&subsection=&category=&criticspicks=&date1=&date2=&locationId=0> (referer: http://www.nowtoronto.com/music/listings/)
2014-03-21 11:21:24-0400 [NTSpider] DEBUG: Scraped from <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationId=0>
{'eventAddress': [],
'eventArtist': [u'Danny Byrd, S.P.Y., Fred V & Grafix, Marcus Visionary, Lushy '],
'eventDetails': [u'Danny Byrd, S.P.Y., Fred V & Grafix, Marcus Visionary, Lushy ',
u'Bassweek: Projek-Hospitality ',
u'Sound Academy',
u' $35 or wristband TM.'],
'eventHolder': [u'Sound Academy'],
'eventLocality': [],
'eventLocation': [],
'eventName': [],
'eventOrganization': [],
'eventPhone': [],
'eventPostalCode': [],
'eventTitle': [u'Bassweek: Projek-Hospitality '],
'eventURL': []}
2014-03-21 11:21:24-0400 [NTSpider] DEBUG: Scraped from <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationId=0>
{'eventAddress': [],
'eventArtist': [u'Danny Byrd, S.P.Y., Fred V & Grafix, Marcus Visionary, Lushy '],
'eventDetails': [u'Danny Byrd, S.P.Y., Fred V & Grafix, Marcus Visionary, Lushy ',
u'Bassweek: Projek-Hospitality ',
u'Sound Academy',
u' $35 or wristband TM.'],
'eventHolder': [u'Sound Academy'],
'eventLocality': [],
'eventLocation': [],
'eventName': [],
'eventOrganization': [],
'eventPhone': [],
'eventPostalCode': [],
'eventTitle': [u'Bassweek: Projek-Hospitality '],
'eventURL': []}
You should pass your item from parse()
to parse_details()
in Request
's meta argument:
yield Request(urlparse.urljoin(response.url,
detail_url.extract()[0]),
meta={'item': item},
callback=self.parse_details)
Then, in parse_details()
you can get the item from response.meta['item']
(docs).
Also, you probably want to yield
an item if no details found:
if detail_url:
yield Request(urlparse.urljoin(response.url,
detail_url.extract()[0]),
meta={'item': item},
callback=self.parse_details)
else:
yield item
Hope that helps.