ERROR: Error caught on signal handler: <bound method OffsiteMiddleware.request_scheduled of <scrapy.downloadermiddlewares.offsite.OffsiteMiddleware object at 0x000002C3EBB5DB50>> — I got the above error while developing a crawling script that combines Scrapy and Selenium. How can I fix it?
# -*- coding: utf-8 -*-
import scrapy
from scrapy import FormRequest
from scrapy.http import HtmlResponse
from datetime import datetime, timedelta
from bloomberg.items import BloombergItem
import json
from scrapy.shell import inspect_response
import re
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
class HistoricalDataSpider(scrapy.Spider):
    """Scrape USD rows from the investing.com economic-calendar widget
    (econcal.forexprostools.com) using a Selenium-driven Chrome browser.

    Expected spider attributes, set by the caller or a subclass:
        start_date, end_date -- datetime bounds of the crawl window.
        scraped_dates        -- sorted list of datetimes already scraped.

    Yields one BloombergItem whose ``data`` field is a list of rows:
    [date, time, event_name, actual, forecast, previous].
    """
    name = 'historical_data'
    # allowed_domains = ['econcal.forexprostools.com']
    start_urls = ['http://econcal.forexprostools.com/']
    output = []            # accumulated result rows (class-level, shared across instances)
    not_parsed_pages = 0   # pending-page counter; the item is emitted when it reaches 0

    @staticmethod
    def _extract_number(cells):
        """Return the numeric part of a calendar cell ('1,234.5%' -> '1234.5').

        ``cells`` is the list produced by ``.css('... span::text').extract()``;
        the value of interest is the second element. Returns None when the
        cell is missing, empty, or contains no number (original code raised
        IndexError/AttributeError in those cases).
        """
        if not cells or len(cells) < 2:
            return None
        text = cells[1].strip()
        if not text:
            return None
        # Drop thousands separators, then grab the signed decimal number.
        match = re.search(r'[-0-9.]+', text.replace(',', ''))
        return match.group() if match else None

    def start_requests(self):
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # Set up the Chrome WebDriver with options.
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get('http://econcal.forexprostools.com/')
            page_source = driver.page_source
            # Walk the requested range in 30-day windows.
            for offset in range(0, (self.end_date - self.start_date).days + 1, 30):
                start_date = self.start_date + timedelta(offset)
                end_date = min(self.start_date + timedelta(offset + 30), self.end_date)
                # Skip windows already covered by a previous run, except near
                # the end of the range (< 90 days) where data may still change.
                skip = False
                for i, date in enumerate(self.scraped_dates):
                    if start_date <= date <= end_date and (self.end_date - date).days > 90:
                        skip = True
                        self.scraped_dates = self.scraped_dates[i:]
                        break
                if skip:
                    continue
                start_date = start_date.strftime('%Y-%m-%d')
                end_date = end_date.strftime('%Y-%m-%d')
                html_response = HtmlResponse(url=driver.current_url,
                                             body=page_source, encoding='utf-8')
                for row in html_response.css('button'):
                    # extract_first() may be None; guard before the 'in' test.
                    currency = row.css('div::text').extract_first() or ''
                    if 'USD' not in currency:
                        continue
                    event_datetime = datetime.strptime(
                        row.css('button::attr(event_timestamp)').extract_first(),
                        '%Y-%m-%d %H:%M:%S')
                    date = event_datetime.strftime('%m/%d/%Y')
                    time = event_datetime.strftime('%H:%M')
                    event_name = row.css('.left.event::text').extract_first().strip()
                    actual = self._extract_number(row.css('.act span::text').extract())
                    forecast = self._extract_number(row.css('.fore span::text').extract())
                    prev = self._extract_number(row.css('.prev span::text').extract())
                    new_row = [date, time, event_name, actual, forecast, prev]
                    if new_row not in self.output:
                        self.output.append(new_row)
            # self.not_parsed_pages -= 1
            if self.not_parsed_pages == 0:
                item = BloombergItem()
                item['data'] = self.output
                yield item
        finally:
            # Always release the browser, even if parsing raises.
            driver.quit()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for bloomberg project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'bloomberg'
SPIDER_MODULES = ['bloomberg.spiders']
NEWSPIDER_MODULE = 'bloomberg.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
SPIDER_MIDDLEWARES = {
    # Fixed: stray trailing line-continuation backslash after the value.
    'bloomberg.middlewares.BloombergSpiderMiddleware': 543,
}
ITEM_PIPELINES = {
    'bloomberg.pipelines.BloombergPipeline': 300,
}
# Output filename (including the .csv extension)
HISTORY_OUTPUT_FILENAME = 'bloomberg_history.csv'
LOG_LEVEL = "DEBUG"
# Quandl API key
# SECURITY NOTE(review): secret committed to source; move it to an
# environment variable or an untracked local settings file.
QUANDL_API_KEY = "X4hf1sbHT6D3xgN6kz7N"
# VX master file path with filename (example/example.csv) — VIX futures curve
VX_MASTER_FILE_PATH = r"C:/Users/Prisma/Desktop/AdditionalData/VXF.csv"
# Treasury master file path with filename (example/example.csv)
TRES_MASTER_FILE_PATH = r"C:/Users/Prisma/Desktop/AdditionalData/TSY.csv"
EVENTS_DIRECTORY = r"C:/Users/Prisma/Desktop/AdditionalData/"
I'm trying to combine Selenium with Scrapy, but I don't know much about how process_item gets called from a spider.
Don't start Selenium inside start_requests; the URL should be yielded as a Request so it passes through the middleware chain and is then parsed in the spider's callback.
Instead, add a custom downloader middleware that uses Selenium to fetch and render the page. For a starting point, you can refer to this repo.
Finally, don't forget to register your custom middleware in settings.py (DOWNLOADER_MIDDLEWARES).