selenium-webdriver, web-scraping, error-handling, scrapy

Selenium and Scrapy Combination


ERROR: Error caught on signal handler: <bound method OffsiteMiddleware.request_scheduled of <scrapy.downloadermiddlewares.offsite.OffsiteMiddleware object at 0x000002C3EBB5DB50>>

While developing a script for crawling with Scrapy and Selenium, I got the error above. How can I fix it?

# -*- coding: utf-8 -*-
import scrapy
from scrapy import FormRequest
from scrapy.http import HtmlResponse
from datetime import datetime, timedelta
from bloomberg.items import BloombergItem
import json
from scrapy.shell import inspect_response
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class HistoricalDataSpider(scrapy.Spider):
    name = 'historical_data'
    # allowed_domains = ['econcal.forexprostools.com']
    start_urls = ['http://econcal.forexprostools.com/']
    output = []
    not_parsed_pages = 0
    def start_requests(self):
        chrome_options = Options()
        # chrome_options.add_argument('--headless')

        # Setting up the Chrome WebDriver with options
        driver = webdriver.Chrome(options=chrome_options)
        
        # Replace '' with the URL you want to scrape
        driver.get('http://econcal.forexprostools.com/')
        page_source = driver.page_source
        # self.start_date, self.end_date and self.scraped_dates are assumed to be
        # set elsewhere (e.g. passed in as spider arguments)
        for n in range(0, (self.end_date-self.start_date).days+1, 30):
            start_date = self.start_date + timedelta(n)
            end_date = self.start_date + timedelta(n+30)
            if end_date > self.end_date: end_date = self.end_date
            skip = False
            for i, date in enumerate(self.scraped_dates):
                if start_date <= date <= end_date and (self.end_date - date).days > 90:
                    skip = True
                    self.scraped_dates = self.scraped_dates[i:]
                    break
            if skip:
                continue
            start_date = start_date.strftime('%Y-%m-%d')
            end_date = end_date.strftime('%Y-%m-%d')
        html_response = HtmlResponse(url=driver.current_url, body=page_source, encoding='utf-8')
        
        rows = html_response.css('button')
        
        # output = []
        for row in rows:
            # print(row, 'init###')
            if 'USD' in row.css('div::text').extract_first():

                event_datetime = row.css('button::attr(event_timestamp)').extract_first()
                event_datetime = datetime.strptime(event_datetime, '%Y-%m-%d %H:%M:%S')
                date = event_datetime.strftime('%m/%d/%Y')
                time = event_datetime.strftime('%H:%M')
                event_name = row.css('.left.event::text').extract_first().strip()
                actual = row.css('.act span::text').extract()
                if actual: 
                    actual = actual[1].strip()
                    if actual:
                        actual = re.sub(',', '', actual)
                        actual = re.search('[-0-9.]+', actual).group()
                else: actual = None                
                forecast = row.css('.fore span::text').extract()
                if forecast: 
                    forecast = forecast[1].strip()
                    if forecast:
                        forecast = re.sub(',', '', forecast)
                        forecast = re.search('[-0-9.]+', forecast).group()
                else: forecast = None
                prev = row.css('.prev span::text').extract()
                if prev: 
                    prev = prev[1].strip()
                    if prev:
                        prev = re.sub(',', '', prev)
                        prev = re.search('[-0-9.]+', prev).group()
                else: prev = None
                new_row = [date, time, event_name, actual, forecast, prev]
                if new_row not in self.output:
                    self.output.append(new_row)
        # self.not_parsed_pages -= 1
        if self.not_parsed_pages == 0:
            item = BloombergItem()
            item['data'] = self.output
            yield item
        driver.quit()

settings.py


# -*- coding: utf-8 -*-

# Scrapy settings for bloomberg project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bloomberg'

SPIDER_MODULES = ['bloomberg.spiders']
NEWSPIDER_MODULE = 'bloomberg.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


SPIDER_MIDDLEWARES = {
   'bloomberg.middlewares.BloombergSpiderMiddleware': 543,
}

ITEM_PIPELINES = {
   'bloomberg.pipelines.BloombergPipeline': 300,
}

# Filename with .csv
HISTORY_OUTPUT_FILENAME = 'bloomberg_history.csv'

LOG_LEVEL = "DEBUG"

# Quandl api key
QUANDL_API_KEY = "X4hf1sbHT6D3xgN6kz7N"

# VX MASTER file path with filename (example/example.csv) vix futures curve
VX_MASTER_FILE_PATH = r"C:/Users/Prisma/Desktop/AdditionalData/VXF.csv"

#Treasury master file path with filename(example/example.csv)
TRES_MASTER_FILE_PATH = r"C:/Users/Prisma/Desktop/AdditionalData/TSY.csv"

EVENTS_DIRECTORY = r"C:/Users/Prisma/Desktop/AdditionalData/"

I want to combine Selenium with Scrapy, but I don't understand well how process_item gets called for the items my spider yields.
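
For reference, process_item is never called from inside the spider: Scrapy calls it on every enabled item pipeline each time the spider yields an item (here, the BloombergItem yielded at the end of start_requests). Below is a minimal sketch of what bloomberg/pipelines.py could look like; the CSV-writing logic is only an assumption based on the HISTORY_OUTPUT_FILENAME setting, not the actual pipeline code.

# bloomberg/pipelines.py -- illustrative sketch, not the original pipeline
import csv


class BloombergPipeline:
    def open_spider(self, spider):
        # Assumed behaviour: write the collected rows to the CSV named in settings.py
        filename = spider.settings.get('HISTORY_OUTPUT_FILENAME', 'output.csv')
        self.file = open(filename, 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)
        self.writer.writerow(['date', 'time', 'event', 'actual', 'forecast', 'previous'])

    def process_item(self, item, spider):
        # Called automatically by Scrapy for every item the spider yields
        for row in item['data']:
            self.writer.writerow(row)
        return item

    def close_spider(self, spider):
        self.file.close()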


Solution

  • Don't start Selenium in start_requests; instead, yield the URLs as ordinary requests, let a downloader middleware render them with Selenium, and parse the resulting responses in the spider (see the sketch after this list).

    Just add a custom downloader middleware that uses Selenium; as a starting point, you can refer to this repo.

    Finally, don't forget to register your custom middleware in settings.py.
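
A minimal sketch of such a downloader middleware, assuming every request should be rendered by a headless Chrome instance; the class name SeleniumMiddleware and the file bloomberg/middlewares.py are placeholders, not existing project code:

# bloomberg/middlewares.py -- illustrative sketch
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SeleniumMiddleware:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=chrome_options)

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Close the browser when the spider finishes
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        # Let Selenium fetch and render the page, then hand the HTML back to Scrapy
        self.driver.get(request.url)
        return HtmlResponse(
            url=self.driver.current_url,
            body=self.driver.page_source,
            encoding='utf-8',
            request=request,
        )

    def spider_closed(self, spider):
        self.driver.quit()

With the middleware registered in settings.py, the spider no longer touches Selenium at all: start_requests just yields plain requests, and the parsing code moves into a normal callback that receives the rendered HtmlResponse.

# settings.py -- enable the middleware (priority 543 is just an example)
DOWNLOADER_MIDDLEWARES = {
    'bloomberg.middlewares.SeleniumMiddleware': 543,
}

# in the spider
def start_requests(self):
    yield scrapy.Request('http://econcal.forexprostools.com/', callback=self.parse)

def parse(self, response):
    # response.css(...) works on the Selenium-rendered HTML
    ...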