I have adapted the code from "Using Middleware to ignore duplicates in Scrapy":
from scrapy.exceptions import DropItem
from scrapy import log
import os.path


class IgnoreDuplicates(object):

    def __init__(self):
        self._cu_file = open("crawled_urls.txt", "a+")
        self._crawled_urls = set([line.strip() for line in self._cu_file.readlines()])

    def process_request(self, request, spider):
        if request.url in self._crawled_urls:
            raise DropItem("Duplicate product scrape caught by IgnoreDuplicates at <%s>" % request.url)
        else:
            self._crawled_urls.add(request.url)
            self._cu_file.write(request.url + '\n')
            log.msg("IgnoreDuplicates recorded this url " + request.url, level=log.DEBUG)
        return None
I have also registered the middleware in settings.py:
SPIDER_MANAGER_CLASS = 'slybot.spidermanager.SlybotSpiderManager'
EXTENSIONS = {'slybot.closespider.SlybotCloseSpider': 1}
ITEM_PIPELINES = {'slybot.dupefilter.DupeFilterPipeline': 1}
SPIDER_MIDDLEWARES = {'slybot.middleware.IgnoreDuplicates': 500, 'slybot.spiderlets.SpiderletsMiddleware': 999} # as close as possible to spider output
PLUGINS = ['slybot.plugins.scrapely_annotations.Annotations']
SLYDUPEFILTER_ENABLED = True
PROJECT_DIR = 'slybot-project'
FEED_EXPORTERS = {
    'csv': 'slybot.exporter.SlybotCSVItemExporter',
}
CSV_EXPORT_FIELDS = None

try:
    from local_slybot_settings import *
except ImportError:
    pass
The process_request method never gets called. I've tried changing the priority value for the IgnoreDuplicates key in settings.py so the middleware runs both before and after the SpiderletsMiddleware, but neither the exception nor the log message shows up in the output.
How do I make sure the middleware is called?
It turns out the callbacks are different for spider middleware: process_request belongs to downloader middleware, so Scrapy never calls it on a class listed in SPIDER_MIDDLEWARES. A spider middleware hooks in through methods like process_spider_output instead. I used the code from this snippet as a reference: http://snipplr.com/view/67018/middleware-to-avoid-revisiting-already-visited-items/
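For reference, the original process_request approach could also work if the class were registered as a downloader middleware instead of a spider middleware. Below is only a rough sketch of that alternative, not what I ended up using; the module path and priority are assumptions, and note that a downloader middleware drops a request by raising IgnoreRequest rather than DropItem:

from scrapy.exceptions import IgnoreRequest


class IgnoreDuplicates(object):
    """Downloader middleware sketch: skip requests for URLs already seen."""

    def __init__(self):
        self._crawled_urls = set()

    def process_request(self, request, spider):
        # Raising IgnoreRequest tells Scrapy to drop this request
        if request.url in self._crawled_urls:
            raise IgnoreRequest("Already crawled: %s" % request.url)
        # Returning None lets the request continue through the middleware chain
        self._crawled_urls.add(request.url)
        return None

# settings.py (assumed module path and priority)
# DOWNLOADER_MIDDLEWARES = {'slybot.middleware.IgnoreDuplicates': 543}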
Here is a working version of the middleware code I posted in the question, rewritten as a spider middleware:
from scrapy.http import Request
from scrapy import log
import os.path


class IgnoreVisitedItems(object):

    def __init__(self):
        # Load the URLs that have already been crawled
        self._cu_file = open("crawled_urls.txt", "a+")
        # "a+" positions the file at its end, so rewind before reading
        self._cu_file.seek(0)
        self._crawled_urls = set([line.strip() for line in self._cu_file.readlines()])

    def process_spider_output(self, response, result, spider):
        ret = []
        for x in result:
            # Find the URL in the result or response
            if isinstance(x, Request):
                url = x.url
            else:
                url = response.request.url

            # Check if the URL has been crawled, and add
            # it to the list of crawled URLs.
            if url in self._crawled_urls:
                log.msg("Ignoring already visited: %s" % url,
                        level=log.INFO, spider=spider)
            else:
                log.msg("Adding %s to list of visited urls" % url,
                        level=log.INFO, spider=spider)
                self._cu_file.write(url + '\n')
                self._crawled_urls.add(url)
                ret.append(x)
        return ret
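To enable it, the SPIDER_MIDDLEWARES entry from the question only needs the class name updated (this assumes the class still lives in slybot/middleware.py; adjust the path to wherever the file is saved):

SPIDER_MIDDLEWARES = {
    'slybot.middleware.IgnoreVisitedItems': 500,
    'slybot.spiderlets.SpiderletsMiddleware': 999,
}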