Search code examples
python · scrapy · filenames

There is a problem in the pipelines file: it is not fetching the book name, and instead saves a single image named None.jpg for each crawl.


items.py file. Since I was aware of the required image_urls and images fields, this file didn't cause any problem.

import scrapy
from scrapy.loader.processors import TakeFirst

class BooksToScrapeItem(scrapy.Item):
    """Item holding a book's cover-image URL(s) and its title."""
    # URLs of cover images; consumed by ImagesPipeline for downloading.
    image_urls = scrapy.Field()
    # Populated by ImagesPipeline with the download results.
    images = scrapy.Field()
    # TakeFirst collapses the list the ItemLoader collects into a single value.
    book_name = scrapy.Field(
        output_processor = TakeFirst()
    )

pipelines.py file. I think there must be a problem in the get_media_requests method, as it is not fetching the book_name from the item.

from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request


class BooksToScrapeImagePipeline(ImagesPipeline):
    """Custom ImagesPipeline that stores each image as full/<book_name>.jpg."""

    def get_media_requests(self, item, info):
        # Attach the book title to each download request via meta so that
        # file_path() below can use it as the file name.
        # NOTE(review): if 'book_name' was never populated on the item,
        # item.get('book_name') is None and every image is saved as
        # full/None.jpg — the fault would then be upstream in the spider,
        # not in this line.
        return [Request(x,meta={'bookname': item.get('book_name')}) for x in item.get(self.images_urls_field, [])] #i think that the problem is in this line

    def file_path(self, request, response=None, info=None):
        # Name the stored file after the book instead of the default checksum.
        return 'full/%s.jpg' % (request.meta['bookname'])

The spider file which I use for scraping. It worked when I didn't customize the pipeline file.

import scrapy
from scrapy.loader import ItemLoader
from books_to_scrape.items import BooksToScrapeItem

class ImgscrapeSpider(scrapy.Spider):
    """Spider for books.toscrape.com that yields items with cover URL and title."""
    name = 'imgscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    def parse(self, response):
        # Each product is one <article class="product_pod"> element.
        for article in response.xpath("//article[@class='product_pod']"):
            loader = ItemLoader(item=BooksToScrapeItem(),selector=article)
            # Thumbnail src is relative; make it absolute for ImagesPipeline.
            relative_url = article.xpath(".//div/a/img[@class='thumbnail']/@src").extract_first()
            abs_url = response.urljoin(relative_url)
            loader.add_value('image_urls',abs_url)
            # NOTE(review): this xpath is already relative to the article
            # selector, so repeating .//article[@class='product_pod'] makes it
            # match nothing — book_name stays empty and the pipeline receives
            # None.  ".//h3/a/@title" would select the full title correctly.
            loader.add_xpath('book_name',".//article[@class='product_pod']/h3/a/text()") 
            yield loader.load_item() 

Solution

  • Your problem is in relative xpath

    loader.add_xpath('book_name', ".//article[@class='product_pod']/h3/a/text()") 
    

    Loader uses xpath("//article[@class='product_pod']") as selector

       for article in response.xpath("//article[@class='product_pod']"):
            loader = ItemLoader(item=BooksToScrapeItem(), selector=article)
    

    so all relative xpaths are relative to "//article[@class='product_pod']" and they don't need "//article[@class='product_pod']" in xpath.

    With the relative xpath ".//article[@class='product_pod']/h3/a/text()" it couldn't find the title, so book_name was empty for every item; None was then used as the title, and the same file name None.jpg was reused for all images.


    It has to be

    loader.add_xpath('book_name', ".//h3/a/text()")  # title with `...`
    

    BTW: text() doesn't contain the full title, only a version shortened with `...`. To get the full title you have to read the title= attribute:

    loader.add_xpath('book_name', ".//h3/a/@title")  # full title
    

    I created version with all code in one file to run it without creating project.

    Everyone can copy it to single file and run to test it.

    import scrapy
    from scrapy.loader.processors import TakeFirst
    
    class BooksToScrapeItem(scrapy.Item):
        """Item holding a book's cover-image URL(s) and its title."""
        # URLs of cover images; consumed by ImagesPipeline for downloading.
        image_urls = scrapy.Field()
        # Populated by ImagesPipeline with the download results.
        images = scrapy.Field()
        # TakeFirst collapses the list the ItemLoader collects into one value.
        book_name = scrapy.Field(
            output_processor = TakeFirst()
        )
    
    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline
    
    class BooksToScrapeImagePipeline(ImagesPipeline):
        """Image pipeline that names each downloaded file after its book title."""

        def get_media_requests(self, item, info):
            # Carry the book title along on every download request via meta,
            # so file_path() can use it when choosing the stored file name.
            requests = []
            for image_url in item.get(self.images_urls_field, []):
                requests.append(Request(image_url, meta={'bookname': item.get('book_name')}))
            return requests

        def file_path(self, request, response=None, info=None):
            # Store as full/<book title>.jpg instead of the default checksum name.
            return 'full/%s.jpg' % request.meta['bookname']
    
    from scrapy.loader import ItemLoader
    
    class ImgscrapeSpider(scrapy.Spider):
        """Spider that collects book cover URLs and full titles from books.toscrape.com."""
        name = 'imgscrape'
        allowed_domains = ['books.toscrape.com']
        start_urls = ['http://books.toscrape.com']

        def parse(self, response):
            for product in response.xpath("//article[@class='product_pod']"):
                item_loader = ItemLoader(item=BooksToScrapeItem(), selector=product)

                # Thumbnail src is relative; convert to an absolute URL.
                thumb_src = product.xpath(".//div/a/img[@class='thumbnail']/@src").extract_first()
                item_loader.add_value('image_urls', response.urljoin(thumb_src))

                # XPaths here are relative to the product_pod selector, so the
                # //article[@class='product_pod'] prefix must not be repeated.
                # .//h3/a/text() would only give a title shortened with `...`,
                # so read the full title from the <a> tag's title attribute.
                item_loader.add_xpath('book_name', ".//h3/a/@title")

                yield item_loader.load_item()
    
    # -----------------------------------------------------------------------------
    
    from scrapy.crawler import CrawlerProcess
    
    # Run the spider standalone (no Scrapy project needed).
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
    
        # export scraped items to a file: CSV, JSON or XML
        'FEED_FORMAT': 'csv',     # csv, json, xml
        'FEED_URI': 'output.csv', # output file name
    
        # download images into `IMAGES_STORE/full` (the standard subfolder) and convert to JPG (even if already JPG)
        # requires `image_urls` on the yielded items plus both ITEM_PIPELINES and IMAGES_STORE to be set
    
        'ITEM_PIPELINES': {'__main__.BooksToScrapeImagePipeline': 1},            # pipeline defined in this very file, hence the __main__ prefix
        'IMAGES_STORE': '.',                   # this folder has to exist before downloading
    
    })
    c.crawl(ImgscrapeSpider)
    c.start()