Without overwriting the file_path function, the spider download all the images with the default 'request URL hash' filenames. However when I try to overwrite the function it just doesn't work. There is nothing in the default output attribute, images.
I have tried both relative and absolute paths for the IMAGES_STORE variable in settings.py as well as the file_path function to no avail. Even when I overwrite the file_path function with the exact same default file_path function, the images do not download.
Any help would be much appreciated!
BOT_NAME = 'HomeApp2'
SPIDER_MODULES = ['HomeApp2.spiders']
NEWSPIDER_MODULE = 'HomeApp2.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'
# ScrapySplash settings
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
'HomeApp2.pipelines.DuplicatesPipeline': 250,
'HomeApp2.pipelines.ProcessImagesPipeline': 251,
'HomeApp2.pipelines.HomeApp2Pipeline': 300,
IMAGES_STORE = 'files'
import json
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class DuplicatesPipeline(object):
def __init__(self):
self.sku_seen = set()
def process_item(self, item, spider):
if item['sku'] in self.sku_seen:
raise DropItem("Repeated item found: %s" % item)
return item
class ProcessImagesPipeline(ImagesPipeline):
def file_path(self, request):
sku = request.meta['sku']
num = request.meta['num']
return '%s/%s.jpg' % (sku, num)
def get_media_requests(self, item, info):
print('- - - - - - - - - - - - - - - - - -')
sku = item['sku']
for num, image_url in item['image_urls'].items():
yield scrapy.Request(url=image_url, meta = {'sku': sku,
'num': num})
class HomeApp2Pipeline(object):
def __init__(self):
self.file = open('items.jl', 'w')
def process_item(self, item, spider):
line = json.dumps(dict(item)) + '\n'
return item
import scrapy
from scrapy_splash import SplashRequest
from HomeApp2.items import HomeAppItem
class AppScrape2Spider(scrapy.Spider):
name = 'AppScrape2'
def start_requests(self):
yield SplashRequest(
url = 'https://www.appliancesonline.com.au/product/samsung-sr400lstc-400l-top-mount-fridge?sli_sku_jump=1',
callback = self.parse,
def parse(self, response):
item = HomeAppItem()
product = response.css('aol-breadcrumbs li:nth-last-of-type(1) .breadcrumb-link ::text').extract_first().rsplit(' ', 1)
if product is None:
return {}
item['sku'] = product[-1]
item['image_urls'] = {}
root_url = 'https://www.appliancesonline.com.au'
product_picture_count = 0
for pic in response.css('aol-product-media-gallery-main-image-portal img.image'):
product_picture_count = product_picture_count + 1
item['image_urls']['p'+str(product_picture_count)] = (
root_url + pic.css('::attr(src)').extract_first())
feature_count = 0
for feat in response.css('aol-product-features .feature'):
feature_count = feature_count + 1
item['image_urls']['f'+str(feature_count)] = (
root_url + feat.css('.feature-image ::attr(src)').extract_first())
yield item
import scrapy
class HomeAppItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
sku = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
After much trial and error, I found the solution. It was simply adding the rest of the parameters to the file_path method.
def file_path(self, request):
def file_path(self, request, response=None, info=None):
It seems that the my original code overrode the method incorrectly causing calls to the method to fail.