I have the following custom pipeline for downloading JSON files. It was functioning fine until I needed to add the `__init__` method, in which I subclass the `FilesPipeline` class in order to add a few new properties. The pipeline takes URLs that point to API endpoints and downloads their responses. The folders are properly created when running the spider via `scrapy crawl myspider`, and the two print statements in the `file_path` function show the correct values (filename and filepath). However, the files are never actually downloaded.
I did find a few similar questions about custom file pipelines and files not downloading (here (the solution was they needed to yield the items instead of returning them) and here (the solution was needing to adjust the ROBOTSTXT_OBEY setting) for example), but the solutions did not work for me.
What am I doing wrong (or forgetting to do) when subclassing the `FilesPipeline`? I've been racking my brain over this issue for a good 3 hours and my google-fu has not yielded any resolutions for my case.
class LocalJsonFilesPipeline(FilesPipeline):
    """Files pipeline that stores each downloaded response as ``<name>.json``.

    NOTE(review): this is the version from the question, which creates the
    target folders but never writes any file. The accompanying answer
    attributes that to overriding ``from_crawler`` rather than
    ``from_settings``; the behaviour is intentionally left unchanged here.
    """

    FILES_STORE = "json_src"
    FILES_URLS_FIELD = "json_url"
    FILES_RESULT_FIELD = "local_json"

    def __init__(self, store_uri, use_response_url=False, filename_regex=None, settings=None):
        """Remember the filename options, then run the base-class initialiser.

        Args:
            store_uri: Storage location handed to the base class.
            use_response_url: When true, filenames are derived from the
                item's ``url`` value instead of its ``json_url`` value.
            filename_regex: Optional pattern source; compiled when truthy.
            settings: Settings object forwarded to the base class.
        """
        self.store_uri = store_uri
        self.use_response_url = use_response_url
        # A falsy pattern (None or '') is stored as-is; parse_path only
        # checks truthiness before using it.
        self.filename_regex = re.compile(filename_regex) if filename_regex else filename_regex
        super().__init__(store_uri, settings=settings)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's spider and its settings."""
        spider = crawler.spider
        if not spider:
            # NOTE(review): BasePipeline is not defined in the visible code.
            return BasePipeline()
        target_dir = f'{cls.FILES_STORE}/{spider.name}'
        spider_settings = spider.settings
        return cls(
            target_dir,
            spider_settings.get('JSON_FILENAME_USE_RESPONSE_URL', False),
            spider_settings.get('JSON_FILENAME_REGEX'),
            spider_settings,
        )

    def parse_path(self, value):
        """Derive a base filename (without extension) from the URL ``value``.

        The first match of the configured regex wins. Without a regex, or
        when it does not match, the last path segment is used, and if that
        segment contains ``=`` only the text after the first ``=`` is kept.
        """
        if self.filename_regex:
            matches = self.filename_regex.findall(value)
            if matches:
                return matches[0]

        # Fallback, e.g. /p/russet-potatoes-5lb-bag-good-38-gather-8482/-/A-77775602
        # yields A-77775602 (any file extension is dropped first).
        url_path = urlparse(value).path
        stem = os.path.splitext(url_path)[0]
        last_segment = stem.rsplit('/', 1)[1]
        if '=' in last_segment:
            return last_segment.split('=', 1)[1]
        return last_segment

    def get_media_requests(self, item, info):
        """Return a one-element request list for the item's JSON URL.

        Returns ``None`` when the item carries no (truthy) ``json_url``.
        """
        json_url = item.get(self.FILES_URLS_FIELD)
        if not json_url:
            return None

        if self.use_response_url:
            name_source = item.get('url', '')
        else:
            name_source = json_url
        request_meta = {'filename': self.parse_path(name_source), 'spider': info.spider.name}
        return [Request(json_url, meta=request_meta)]

    def file_path(self, request, response=None, info=None):
        """Return the storage path for ``request``, printing it for debugging."""
        meta = request.meta
        final_path = f'{self.FILES_STORE}/{meta["spider"]}/{meta["filename"]}.json'
        print('url', request.url)
        print('downloading to', final_path)
        return final_path
And the custom settings of my spider:
class MockSpider(scrapy.Spider):
    """Spider whose per-spider settings register the JSON files pipeline."""

    name = 'mock'

    # Settings applied for this spider only: the pipeline to run and the
    # regex whose first group is used as the downloaded file's base name.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mock.pipelines.LocalJsonFilesPipeline': 200,
        },
        'JSON_FILENAME_REGEX': r'products\/(.+?)\/ProductInfo\+ProductDetails',
    }
Log with the level set to debug
C:\Users\Mike\Desktop\scrapy_test\pipeline_test>scrapy crawl testsite
2020-07-19 11:23:08 [scrapy.utils.log] INFO: Scrapy 2.2.1 started (bot: pipeline
_test)
2020-07-19 11:23:08 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9
.5, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.6 (
tags/v3.7.6:43364a7ae0, Dec 19 2019, 00:42:30) [MSC v.1916 64 bit (AMD64)], pyOp
enSSL 19.0.0 (OpenSSL 1.1.0i 14 Aug 2018), cryptography 2.3.1, Platform Windows
-7-6.1.7601-SP1
2020-07-19 11:23:08 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.se
lectreactor.SelectReactor
2020-07-19 11:23:08 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'pipeline_test',
'LOG_STDOUT': True,
'NEWSPIDER_MODULE': 'pipeline_test.spiders',
'ROBOTSTXT_OBEY': True,
'SPIDER_MODULES': ['pipeline_test.spiders']}
2020-07-19 11:23:08 [scrapy.extensions.telnet] INFO: Telnet Password: 0454b083df
d2028a
2020-07-19 11:23:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-07-19 11:23:08 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-07-19 11:23:08 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-07-19 11:23:08 [scrapy.middleware] INFO: Enabled item pipelines:
['pipeline_test.pipelines.LocalJsonFilesPipeline']
2020-07-19 11:23:08 [scrapy.core.engine] INFO: Spider opened
2020-07-19 11:23:08 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pag
es/min), scraped 0 items (at 0 items/min)
2020-07-19 11:23:08 [scrapy.extensions.telnet] INFO: Telnet console listening on
127.0.0.1:6023
2020-07-19 11:23:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.[testsite].com/robots.txt> (referer: None)
2020-07-19 11:23:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://[testsite]/vpd/v1/products/prod6149174-product/ProductInfo+ProductDetails> (re
ferer: None)
2020-07-19 11:23:08 [stdout] INFO: url
2020-07-19 11:23:08 [stdout] INFO: https://[testsite]/vpd/v1/products/pro
d6149174-product/ProductInfo+ProductDetails
2020-07-19 11:23:08 [stdout] INFO: downloading to
2020-07-19 11:23:08 [stdout] INFO: json_src/[testsite]/prod6149174-product.json
2020-07-19 11:23:09 [scrapy.core.scraper] DEBUG: Scraped from <200 https://[testsite]/vpd/v1/products/prod6149174-product/ProductInfo+ProductDetails>
{'json_url': 'https://[testsite].com/vpd/v1/products/prod6149174-product/Prod
uctInfo+ProductDetails',
'local_json': [],
'url': 'https://[testsite].com/store/c/nature-made-super-b-complex,-tablets/
ID=prod6149174-product'}
2020-07-19 11:23:09 [scrapy.core.engine] INFO: Closing spider (finished)
2020-07-19 11:23:09 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 506,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 5515,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'elapsed_time_seconds': 0.468001,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 7, 19, 15, 23, 9, 96399),
'item_scraped_count': 1,
'log_count/DEBUG': 3,
'log_count/INFO': 14,
'response_received_count': 2,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 7, 19, 15, 23, 8, 628398)}
2020-07-19 11:23:09 [scrapy.core.engine] INFO: Spider closed (finished)
I finally figured out the issue, which was the fact that the `FilesPipeline` class does not have a `from_crawler` method, but instead requires a `from_settings` method when wanting to pass added parameters to a subclassed/custom `FilesPipeline`. Below is my working version of the custom `FilesPipeline`:
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse
import os
import re
class LocalFilesPipeline(FilesPipeline):
    """Files pipeline that names downloads from their URL and groups them per spider.

    Files are stored as ``<spider name>/<filename>.<extension>`` below
    ``FILES_STORE``. The filename is taken from a configurable regex, or from
    the last path segment of the URL when no regex applies.
    """

    FILES_STORE = "data_src"
    FILES_URLS_FIELD = "data_url"
    FILES_RESULT_FIELD = "local_file"

    def __init__(self, settings=None):
        """Read the filename options from ``settings`` and initialise the base class.

        Attributes:
            use_response_url: derive the filename from the item's ``url``
                value instead of the individual ``data_url`` entries.
            filename_regex: compiled patterns used to pull filenames out of
                URLs; the n-th pattern is applied to the n-th URL of an item.
            filename_suffixes: suffixes appended to filenames when an item
                has several files to download.
            filename_extension: extension appended to every filename in
                ``file_path``.

        Raises:
            ValueError: if both FILENAME_REGEX and FILENAME_SUFFIXES are set
                but contain a different number of elements.
        """
        # ``settings`` defaults to None, so fall back to an empty mapping for
        # the lookups below instead of failing on ``None.get``.
        lookup = settings if settings is not None else {}

        self.use_response_url = lookup.get('FILENAME_USE_RESPONSE_URL', False)
        self.filename_extension = lookup.get('FILENAME_EXTENSION', 'json')

        patterns = self._as_list(lookup.get('FILENAME_REGEX', []))
        suffixes = self._as_list(lookup.get('FILENAME_SUFFIXES', []))
        if patterns and suffixes and len(patterns) != len(suffixes):
            raise ValueError('FILENAME_REGEX and FILENAME_SUFFIXES settings must contain the same number of elements')

        # Compile into a new list: writing the compiled patterns back into the
        # list obtained from the settings would modify the settings value itself.
        self.filename_regex = [re.compile(pattern) for pattern in patterns]
        self.filename_suffixes = suffixes

        super().__init__(self.FILES_STORE, settings=settings)

    @staticmethod
    def _as_list(value):
        """Return ``value`` as a new list; a single string becomes a one-element list."""
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @classmethod
    def from_settings(cls, settings):
        """Create the pipeline from a settings object."""
        return cls(settings=settings)

    def parse_path(self, value, index):
        """Return the base filename for the URL ``value``.

        Args:
            value: URL the filename is derived from.
            index: 1-based position of the file within the item; selects
                which configured regex is applied.

        Returns:
            The first match of the selected regex, or the last path segment
            of the URL (extension removed) when there is no usable regex or
            it does not match.
        """
        if self.filename_regex:
            try:
                return self.filename_regex[index - 1].findall(value)[0]
            except IndexError:
                # Either there is no regex for this position or it found
                # nothing; fall through to the path-based name.
                pass

        # Fallback in the event no regex is provided by the spider.
        link_path = os.path.splitext(urlparse(value).path)[0]
        # Keep the last portion separated by forward-slash; rpartition yields
        # the whole string when there is no slash at all.
        return link_path.rpartition('/')[2]

    def get_media_requests(self, item, info):
        """Build one download request per URL found on the item.

        Each request carries the spider name and the computed filename in its
        ``meta`` so that ``file_path`` can assemble the storage path.

        Returns:
            A list of requests; empty when the item has no URLs.
        """
        file_urls = item.get(self.FILES_URLS_FIELD)
        if not file_urls:
            return []
        if isinstance(file_urls, str):
            # A bare string would otherwise be iterated character by character.
            file_urls = [file_urls]

        requests = []
        total_urls = len(file_urls)
        suffix_count = len(self.filename_suffixes)
        for i, file_url in enumerate(file_urls, 1):
            filename_url = file_url if not self.use_response_url else item.get('url', '')
            filename = self.parse_path(filename_url, i)
            if i <= suffix_count:
                current_suffix = self.filename_suffixes[i - 1]
                if current_suffix.startswith('/'):
                    # This ends up creating a separate folder per type of file.
                    filename += current_suffix
                else:
                    # Keeps all files in a single folder while still making it
                    # easy to tell each type of file apart, which helps when
                    # searching for a file by its base name.
                    filename += f'_{current_suffix}'
            elif total_urls > 1:
                # No suffix configured for this position: number the files in
                # the order they were added to the item.
                filename += f'_file{i}'
            requests.append(Request(file_url, meta={'spider': info.spider.name, 'filename': filename}))
        return requests

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``<spider>/<filename>.<extension>`` for ``request``.

        ``item`` is accepted for callers that pass it as a keyword and is not
        used here.
        """
        return f'{request.meta["spider"]}/{request.meta["filename"]}.{self.filename_extension}'
Then, to utilize the pipeline, you can set the applicable values in a spider's `custom_settings` property:
# Per-spider settings: enable the pipeline and give the regex (one entry per
# file URL on the item) whose first group becomes the file's base name.
custom_settings = {
    'ITEM_PIPELINES': {
        'spins.pipelines.LocalFilesPipeline': 200,
    },
    'FILENAME_REGEX': [
        r'products\/(.+?)\/ProductInfo\+ProductDetails',
    ],
}