I have about 100 spiders on a server. Every morning they all start scraping, and each one writes to its own log file. Sometimes a couple of them give me an error. When a spider fails I have to go to the server and read its log file, but I would rather receive the relevant log output by email.
I have already set up a dynamic mail sender as follows:
from scrapy import signals
from django.conf import settings as djsettings
from django.core.mail import send_mail


class FirstBotSpiderMiddleware:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.stats)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider, reason):
        error_count = self.stats.get_value('log_count/ERROR')
        counts = self.stats.get_value('item_scraped_count')
        count_403 = self.stats.get_value('downloader/response_status_count/403')
        count_404 = self.stats.get_value('downloader/response_status_count/404')
        robots_404 = self.stats.get_value('robotstxt/response_status_count/404')
        robots_403 = self.stats.get_value('robotstxt/response_status_count/403')
        duplicate_count = self.stats.get_value('item_dropped_count')
        # I want to read all logs here
        content = "some stat string"
        self.mailSender(spider.name, content, logs)

    def mailSender(self, spider, content, logs):
        send_mail(
            "Scrapy " + spider + " done",
            content,
            djsettings.EMAIL_HOST_USER,
            ['xxx@xxx.com'],
        )
I couldn't figure out how to read the error log dynamically in spider_closed in the middleware. Do you have any suggestions?
I have implemented something similar in my own web scraping module.
Below is the implementation; you can use it as a reference.
import gzip
import datetime
from collections import defaultdict

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3


def format_size(size):
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, x)
        size /= 1024.0


class GzipCompressor(gzip.GzipFile):
    extension = '.gz'
    mimetype = 'application/gzip'

    def __init__(self):
        super(GzipCompressor, self).__init__(
            fileobj=PlainCompressor(), mode='w')
        self.read = self.fileobj.read


class PlainCompressor(StringIO):
    extension = ''
    mimetype = 'text/plain'

    def read(self, *args, **kwargs):
        self.seek(0)
        return StringIO.read(self, *args, **kwargs)

    @property
    def size(self):
        return len(self.getvalue())


class StatusMailer(object):
    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder()
        # One in-memory "file" per attachment name, created on first write.
        self.files = defaultdict(compressor)
        self.num_items = 0
        self.num_errors = 0
        self.start_time = datetime.datetime.now()

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped,
                                signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error,
                                signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed,
                                signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        self.num_items += 1
        self.files[spider.name + '.log'].write(
            str(self.num_items) + " " + str(response.url) + '\n')
        # One JSON object per line so the attachment stays readable.
        self.files[spider.name + '-items.json'].write(
            self.encoder.encode(item) + '\n')

    def spider_error(self, failure, response, spider):
        # Every traceback raised in a spider callback ends up in the log attachment.
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def spider_closed(self, spider, reason):
        files = []
        for name, compressed in self.files.items():
            files.append((name + compressed.extension,
                          compressed.mimetype, compressed))

        # Use .get() so the defaultdict does not silently create an empty entry.
        items_file = self.files.get(spider.name + '-items.json')
        size = getattr(items_file, 'size', 0)

        body = '''Crawl statistics:
 - Spider name: {0}
 - Spider started at: {1}
 - Spider finished at: {2}
 - Number of items scraped: {3}
 - Number of errors: {4}
 - Size of scraped items: {5}'''.format(
            spider.name,
            self.start_time,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )
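To enable it, register the class as a Scrapy extension and configure the mail settings in settings.py. A minimal sketch: the STATUSMAILER_* names come from the code above, the MAIL_* settings are the standard ones MailSender.from_settings reads, and the module path yourproject.extensions is just a placeholder for wherever you put the class.

EXTENSIONS = {
    'yourproject.extensions.StatusMailer': 80,  # placeholder path, adjust to your project
}

STATUSMAILER_RECIPIENTS = ['xxx@xxx.com']
STATUSMAILER_COMPRESSION = 'gzip'  # leave unset for plain-text attachments

MAIL_HOST = 'smtp.example.com'  # example SMTP values, replace with your own
MAIL_PORT = 587
MAIL_USER = 'user'
MAIL_PASS = 'password'
MAIL_FROM = 'scrapy@example.com'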
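With that in place, every scraped item URL and every spider traceback is collected in memory and mailed as attachments when the spider closes, so you no longer have to read the files on the server. If you also want the full Scrapy log file in the mail rather than only the captured tracebacks, one option is to give each spider its own LOG_FILE and attach it inside spider_closed before self.mail.send() is called; a rough, untested sketch assuming LOG_FILE is set per spider:

# inside StatusMailer.spider_closed, before self.mail.send(...)
log_path = spider.settings.get('LOG_FILE')
if log_path:
    with open(log_path) as fh:
        files.append((spider.name + '-scrapy.log', 'text/plain', StringIO(fh.read())))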