My items.py looks like:
import scrapy
class NewsItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
pubDate = scrapy.Field()
description = scrapy.Field()
image_link = scrapy.Field()
pass
And a spider used:
class Spider(BaseSpider):
NO_IMAGE = "NoImageFound"
name = '****'
allowed_domains = ['****', '****']
start_urls = [
'http://*****',
]
def parse(self, response):
self.log('A response from %s just arrived!' % response.url)
sel = Selector(response)
_items = sel.xpath('//item')
for item in _items:
_item = NewsItem()
_title = item.xpath('title/text()').extract()
_item['title'] = ""
if _title:
_item['title'] = _title[0]
#other stuffs here
yield Request(_item['link'], callback=self.parse_detail, meta={'_item': _item})
yield _item
def parse_detail(self, response):
_item = response.meta.get('_item')
sel = Selector(response)
try:
_item['image_link'] = sel.select("//div[@class='article_content']/*/img/@src").extract()[0]
except:
try:
_item['image_link'] = sel.select("//div[@class='entry']/descendant::node()/img/@src").extract()[0]
except:
_item['image_link'] = self.NO_IMAGE
if _item['image_link'][0].endswith('gif'):
_item = self.NO_IMAGE
# _item['image_link'] = "TESTING"
return _item
And in the pipelines.py
class NewsUploadPipeline(object):
def process_item(self, item, spider):
title = item['title'].encode('ascii', 'ignore')
description = item['description'].encode('ascii', 'ignore')
link = item['link'].encode('ascii', 'ignore')
image_link = item['image_link'].encode('ascii', 'ignore')
When I run the project, I get this:
File "/home/khadka/rkbnb/my_app/crawler/rkbnbcrawler/rkbnbcrawler/pipelines.py", line 16, in process_item
image_link = item['image_link'].encode('ascii', 'ignore')
File "/home/khadka/rkbnb/my_app/lib/python2.7/site-packages/scrapy/item.py", line 56, in __getitem__
return self._values[key]
KeyError: 'image_link'
Output reports
'log_count/DEBUG': 57,
'log_count/ERROR': 27,
'log_count/INFO': 7,
'log_count/WARNING': 2,
What is wrong? Clearly image_link exists in items.py. Any help or just hint is heartly appreciated.
I found the problem
The problem is the addition of item['image_link']
is not being passed back to parse
function.
Solved
I solved this using
_item = Request(_item['link'], callback=self.parse_detail, meta={'_item': _item})
yield _item
I guess that returned the changes to _item from parse_detail
function.