Search code examples
python · scrapy · filenames

There is a problem in the pipelines file: it is not fetching the book name, and instead saves a single image named None.jpg for each crawl.


items.py file. Since I was aware of the required image_urls and images fields, this file didn't cause any problem.

import scrapy
from scrapy.loader.processors import TakeFirst

class BooksToScrapeItem(scrapy.Item):
    """Item holding a book's cover-image URL(s) and its title."""
    # URLs of cover images; consumed by ImagesPipeline for downloading.
    image_urls = scrapy.Field()
    # Populated by ImagesPipeline with the download results.
    images = scrapy.Field()
    # TakeFirst collapses the list the ItemLoader collects into a single value.
    book_name = scrapy.Field(
        output_processor = TakeFirst()
    )

pipelines.py file. I think there must be a problem in the get_media_requests method, as it is not fetching the book_name from the item.

from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request


class BooksToScrapeImagePipeline(ImagesPipeline):
    """Custom ImagesPipeline that stores each image as full/<book_name>.jpg."""

    def get_media_requests(self, item, info):
        # Attach the book title to each download request via meta so that
        # file_path() below can use it as the file name.
        # NOTE(review): if 'book_name' was never populated on the item,
        # item.get('book_name') is None and every image is saved as
        # full/None.jpg — the fault would then be upstream in the spider,
        # not in this line.
        return [Request(x,meta={'bookname': item.get('book_name')}) for x in item.get(self.images_urls_field, [])] #i think that the problem is in this line

    def file_path(self, request, response=None, info=None):
        # Name the stored file after the book instead of the default checksum.
        return 'full/%s.jpg' % (request.meta['bookname'])

The spider file which I use for scraping. It worked when I didn't customize the pipeline file.

import scrapy
from scrapy.loader import ItemLoader
from books_to_scrape.items import BooksToScrapeItem

class ImgscrapeSpider(scrapy.Spider):
    """Spider for books.toscrape.com that yields items with cover URL and title."""
    name = 'imgscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    def parse(self, response):
        # Each product is one <article class="product_pod"> element.
        for article in response.xpath("//article[@class='product_pod']"):
            loader = ItemLoader(item=BooksToScrapeItem(),selector=article)
            # Thumbnail src is relative; make it absolute for ImagesPipeline.
            relative_url = article.xpath(".//div/a/img[@class='thumbnail']/@src").extract_first()
            abs_url = response.urljoin(relative_url)
            loader.add_value('image_urls',abs_url)
            # NOTE(review): this xpath is already relative to the article
            # selector, so repeating .//article[@class='product_pod'] makes it
            # match nothing — book_name stays empty and the pipeline receives
            # None.  ".//h3/a/@title" would select the full title correctly.
            loader.add_xpath('book_name',".//article[@class='product_pod']/h3/a/text()") 
            yield loader.load_item() 

Solution

  • Your problem is in relative xpath

    loader.add_xpath('book_name', ".//article[@class='product_pod']/h3/a/text()") 
    

    Loader uses xpath("//article[@class='product_pod']") as selector

       for article in response.xpath("//article[@class='product_pod']"):
            loader = ItemLoader(item=BooksToScrapeItem(), selector=article)
    

    so all relative xpaths are relative to "//article[@class='product_pod']" and they don't need "//article[@class='product_pod']" in xpath.

    With the relative xpath ".//article[@class='product_pod']/h3/a/text()" it couldn't find the title, so book_name was empty for every item; None was then used as the title, and the same file name None.jpg was reused for all images.


    It has to be

    loader.add_xpath('book_name', ".//h3/a/text()")  # title with `...`
    

    BTW: text() doesn't contain the full title, only a version shortened with `...`. To get the full title you have to read the title= attribute:

    loader.add_xpath('book_name', ".//h3/a/@title")  # full title
    

    I created version with all code in one file to run it without creating project.

    Everyone can copy it to single file and run to test it.

    import scrapy
    from scrapy.loader.processors import TakeFirst
    
    class BooksToScrapeItem(scrapy.Item):
        """Item holding a book's cover-image URL(s) and its title."""
        # URLs of cover images; consumed by ImagesPipeline for downloading.
        image_urls = scrapy.Field()
        # Populated by ImagesPipeline with the download results.
        images = scrapy.Field()
        # TakeFirst collapses the list the ItemLoader collects into one value.
        book_name = scrapy.Field(
            output_processor = TakeFirst()
        )
    
    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline
    
    class BooksToScrapeImagePipeline(ImagesPipeline):
        """Image pipeline that names each downloaded file after its book title."""

        def get_media_requests(self, item, info):
            # Carry the book title along on every download request via meta,
            # so file_path() can use it when choosing the stored file name.
            requests = []
            for image_url in item.get(self.images_urls_field, []):
                requests.append(Request(image_url, meta={'bookname': item.get('book_name')}))
            return requests

        def file_path(self, request, response=None, info=None):
            # Store as full/<book title>.jpg instead of the default checksum name.
            return 'full/%s.jpg' % request.meta['bookname']
    
    from scrapy.loader import ItemLoader
    
    class ImgscrapeSpider(scrapy.Spider):
        """Spider that collects book cover URLs and full titles from books.toscrape.com."""
        name = 'imgscrape'
        allowed_domains = ['books.toscrape.com']
        start_urls = ['http://books.toscrape.com']

        def parse(self, response):
            for product in response.xpath("//article[@class='product_pod']"):
                item_loader = ItemLoader(item=BooksToScrapeItem(), selector=product)

                # Thumbnail src is relative; convert to an absolute URL.
                thumb_src = product.xpath(".//div/a/img[@class='thumbnail']/@src").extract_first()
                item_loader.add_value('image_urls', response.urljoin(thumb_src))

                # XPaths here are relative to the product_pod selector, so the
                # //article[@class='product_pod'] prefix must not be repeated.
                # .//h3/a/text() would only give a title shortened with `...`,
                # so read the full title from the <a> tag's title attribute.
                item_loader.add_xpath('book_name', ".//h3/a/@title")

                yield item_loader.load_item()
    
    # -----------------------------------------------------------------------------
    
    from scrapy.crawler import CrawlerProcess
    
    # Run the spider standalone (no Scrapy project needed).
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
    
        # export scraped items to a file: CSV, JSON or XML
        'FEED_FORMAT': 'csv',     # csv, json, xml
        'FEED_URI': 'output.csv', # output file name
    
        # download images into `IMAGES_STORE/full` (the standard subfolder) and convert to JPG (even if already JPG)
        # requires `image_urls` on the yielded items plus both ITEM_PIPELINES and IMAGES_STORE to be set
    
        'ITEM_PIPELINES': {'__main__.BooksToScrapeImagePipeline': 1},            # pipeline defined in this very file, hence the __main__ prefix
        'IMAGES_STORE': '.',                   # this folder has to exist before downloading
    
    })
    c.crawl(ImgscrapeSpider)
    c.start()