Tags: python, scrapy, scrapy-middleware

How can I read all of the logs in a middleware?


I have about 100 spiders on a server. Every morning all of the spiders start scraping and write everything to their own log files. Sometimes a couple of them fail with an error. When a spider fails I have to log in to the server and read its log file, but I would like to receive the logs by mail instead.

I have already set up a dynamic mail sender as follows:

from scrapy import signals

# Assumed imports for the Django mail helper used in mailSender() below
from django.core.mail import send_mail
from django.conf import settings as djsettings


class FirstBotSpiderMiddleware:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.stats)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
    
    def spider_closed(self, spider, reason):
        error_count = self.stats.get_value('log_count/ERROR')
        counts = self.stats.get_value('item_scraped_count')
        count_403 = self.stats.get_value('downloader/response_status_count/403')
        count_404 = self.stats.get_value('downloader/response_status_count/404')
        robots_404 = self.stats.get_value('robotstxt/response_status_count/404')
        robots_403 = self.stats.get_value('robotstxt/response_status_count/403')
        duplicate_count = self.stats.get_value('item_dropped_count')

        # I want to read all of the logs here

        content = "some stat string"

        self.mailSender(spider.name, content, logs)

    def mailSender(self, spider, content, logs):
        send_mail(
            "Scrapy " + spider + " done",
            content,
            djsettings.EMAIL_HOST_USER,
            ['[email protected]'],
        )

I couldn't figure out how to read the error log dynamically in spider_closed in the middleware. Do you have any suggestions?


Solution

  • I have implemented a similar method in my web scraping module.

    Below is the implementation; you can look at it and use it as a reference.

    import gzip
    import datetime
    
    from scrapy import signals
    from scrapy.mail import MailSender
    from scrapy.exceptions import NotConfigured
    from scrapy.utils.serialize import ScrapyJSONEncoder
    
    from collections import defaultdict
    
    try:
        from cStringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3
     
    
    def format_size(size):
        for x in ['bytes', 'KB', 'MB', 'GB']:
            if size < 1024.0:
                return "%3.1f %s" % (size, x)
    
            size /= 1024.0
    
    
    class GzipCompressor(gzip.GzipFile):
        extension = '.gz'
        mimetype = 'application/gzip'
    
        def __init__(self):
            super(GzipCompressor, self).__init__(
                fileobj=PlainCompressor(), mode='w')
            self.read = self.fileobj.read
    
    
    class PlainCompressor(StringIO):
        extension = ''
        mimetype = 'text/plain'
    
        def read(self, *args, **kwargs):
            self.seek(0)
    
            return StringIO.read(self, *args, **kwargs)
    
        @property
        def size(self):
            return len(self.getvalue())
    
    
    class StatusMailer(object):
        def __init__(self, recipients, mail, compressor, crawler):
            self.recipients = recipients
            self.mail = mail
            self.encoder = ScrapyJSONEncoder()
            self.files = defaultdict(compressor)
    
            self.num_items = 0
            self.num_errors = 0
            self.start_time = datetime.datetime.now()
    
        @classmethod
        def from_crawler(cls, crawler):
            recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
            compression = crawler.settings.get('STATUSMAILER_COMPRESSION')
    
            if not compression:
                compressor = PlainCompressor
            elif compression.lower().startswith('gz'):
                compressor = GzipCompressor
            else:
                raise NotConfigured
    
            if not recipients:
                raise NotConfigured
    
            mail = MailSender.from_settings(crawler.settings)
            instance = cls(recipients, mail, compressor, crawler)
    
            crawler.signals.connect(instance.item_scraped,
                                    signal=signals.item_scraped)
            crawler.signals.connect(instance.spider_error,
                                    signal=signals.spider_error)
            crawler.signals.connect(instance.spider_closed,
                                    signal=signals.spider_closed)
    
            return instance
    
        def item_scraped(self, item, response, spider):
            self.num_items += 1
            self.files[spider.name + '.log'].write(
                str(self.num_items) + " " + str(response.url) + '\n')
            self.files[spider.name +
                       '-items.json'].write(self.encoder.encode(item))
    
        def spider_error(self, failure, response, spider):
            self.files[spider.name + '.log'].write(failure.getTraceback())
            self.num_errors += 1
    
        def spider_closed(self, spider, reason):
            files = []
            for name, compressed in self.files.items():
                files.append((name + compressed.extension,
                              compressed.mimetype, compressed))
    
            try:
                size = self.files[spider.name + '-items.json'].size
            except KeyError:
                size = 0
    
            body = '''Crawl statistics:
    
                - Spider name: {0}
                - Spider started at: {1}
                - Spider finished at: {2}
                - Number of items scraped: {3}
                - Number of errors: {4}
                - Size of scraped items: {5}'''.format(
                spider.name,
                self.start_time,
                datetime.datetime.now(),
                self.num_items,
                self.num_errors,
                format_size(size)
            )
    
            return self.mail.send(
                to=self.recipients,
                subject='Crawler for %s: %s' % (spider.name, reason),
                body=body,
                attachs=files
            )
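
    To actually use it, the extension has to be enabled in the project settings and Scrapy's mailer has to be configured. A minimal sketch, assuming the StatusMailer class above is saved as statusmailer.py inside a project named myproject (the module path, addresses and credentials are placeholders):

    # settings.py -- sketch only; adjust the module path and mail settings
    EXTENSIONS = {
        'myproject.statusmailer.StatusMailer': 500,
    }

    STATUSMAILER_RECIPIENTS = ['[email protected]']
    STATUSMAILER_COMPRESSION = 'gzip'   # leave unset for plain-text attachments

    # MailSender.from_settings() reads the standard Scrapy mail settings
    MAIL_HOST = 'smtp.example.com'
    MAIL_PORT = 587
    MAIL_FROM = '[email protected]'
    MAIL_USER = '[email protected]'
    MAIL_PASS = 'secret'

    Note that on Python 3 the gzip option would additionally need a bytes-backed buffer (io.BytesIO) rather than StringIO, since gzip.GzipFile writes bytes to its fileobj.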
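
    One thing to keep in mind: StatusMailer only collects the tracebacks delivered through the spider_error signal, not every ERROR line that ends up in the log file. If you want the full error log in the mail, as asked in the question, one possible approach (a sketch, not part of the original answer) is to attach a standard logging.Handler while the spider runs and read the collected records in spider_closed. Scrapy logs through the stdlib logging module, so a handler on the root logger sees those records:

    import logging

    from scrapy import signals


    class ErrorLogCollector(logging.Handler):
        """Collects every record of level ERROR or above that is emitted."""

        def __init__(self):
            super(ErrorLogCollector, self).__init__(level=logging.ERROR)
            self.records = []

        def emit(self, record):
            self.records.append(self.format(record))


    class ErrorMailerExtension:
        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
            crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
            return ext

        def spider_opened(self, spider):
            # Attach the collector for the lifetime of the crawl. If several
            # spiders share one process, filter records by name in emit().
            self.handler = ErrorLogCollector()
            logging.getLogger().addHandler(self.handler)

        def spider_closed(self, spider, reason):
            logging.getLogger().removeHandler(self.handler)
            logs = '\n'.join(self.handler.records)
            # Hand the collected text to whatever mail helper you use, e.g. the
            # mailSender() method from the question or Scrapy's MailSender.
            return logs

    Whether you keep this in a spider middleware, as in the question, or in a standalone extension like StatusMailer is mostly a matter of taste; the signal wiring is the same.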