Search code examples
python, scrapy

(Scrapy) How to pass a variable to the ItemLoader


I'm learning about Scrapy and can't find a solution for a question that I have following a tutorial.

I have this Item:

import scrapy


class ChocolateProduct(scrapy.Item):
    """Item holding the three values scraped for one chocolate product."""

    # Product title text.
    name = scrapy.Field()
    # Price text of the product.
    price = scrapy.Field()
    # Link to the product page.
    url = scrapy.Field()

This ItemLoader:

from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader


def _strip_currency(raw_price):
    """Return the text after the last '£' (the whole text if there is none)."""
    return raw_price.rpartition('£')[2]


def _prepend_site(path):
    """Prefix *path* with the hard-coded site origin."""
    return 'https://www.chocolate.co.uk' + path


class ChocolateProductLoader(ItemLoader):
    """Item loader for chocolate products.

    ``TakeFirst`` is the default output processor; the price and url
    fields additionally get their own input processors.
    """

    default_output_processor = TakeFirst()
    price_in = MapCompose(_strip_currency)
    url_in = MapCompose(_prepend_site)

And this spider:

import scrapy
from urllib.parse import urlparse
from scrapeops_guide.itemsloaders import ChocolateProductLoader
from scrapeops_guide.items import ChocolateProduct


class ChocolateSpider(scrapy.Spider):
    """Spider that loads one ChocolateProduct per product element."""

    name = 'chocolate_spider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response, **kwargs):
        """Yield an item for every product on the page, then follow the
        "next" pagination link if there is one.
        """
        products = response.css('product-item')
        # Reduce the response URL to "scheme://netloc".
        # NOTE: the resulting `url` is computed but not used below; the
        # loader is still created without it.
        url = urlparse(response.url)
        url = f'{url.scheme}://{url.netloc}'
        for product in products:
            # One loader per product element; the CSS queries below are
            # evaluated relative to that element.
            chocolate = ChocolateProductLoader(
                item=ChocolateProduct(),
                selector=product
            )
            chocolate.add_css('name', 'a.product-item-meta__title::text')
            # The regex captures whatever follows the visually hidden
            # "Sale price" label inside the price <span>.
            chocolate.add_css('price', 'span.price',
                              re='<span class="price">\n              '
                                 '<span class="visually-hidden">Sale price'
                                 '</span>(.*)</span>')
            chocolate.add_css('url', 'div.product-item-meta a::attr(href)')
            yield chocolate.load_item()

        # Re-enter this same callback for the next results page.
        next_page = response.css('a[rel="next"]::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

What I'm trying to do is avoid hard-coding the string inside the ItemLoader and make it dynamic instead. For example, I would like to replace the fixed string in the ItemLoader (url_in) with the value computed in the spider (url).

I already tried passing a parameter to the ChocolateProductLoader that is available in the context attribute of the Loader but I'm unable to access it inside the Loader. I'm open to other ways to achieve the same solution.


Solution

  • I found a solution. I realized that if I define a method on the ItemLoader named after the field followed by _in (here, url_in), it is called to process the values collected for that field. So I passed the url as an extra keyword argument when creating my Loader instance:

    # The extra `url` keyword argument is stored in the loader context.
    # `url` must still be the urlparse() result at this point, because its
    # .scheme and .netloc attributes are read here.
    chocolate = ChocolateProductLoader(
                    item=ChocolateProduct(),
                    selector=product,
                    url=f'{url.scheme}://{url.netloc}'
                )
    

    and I changed the ItemLoader code to this:

    from urllib.parse import urlparse
    
    from itemloaders.processors import TakeFirst, MapCompose
    from scrapy.loader import ItemLoader
    
    
    class ChocolateProductLoader(ItemLoader):
        """Loader whose url processing takes its prefix from the context."""
    
        default_output_processor = TakeFirst()
        price_in = MapCompose(lambda text: text.rpartition('£')[2])
    
        def url_in(self, values):
            """Prefix every value with the ``url`` entry of the context."""
            prefix = self.context.get("url")
            return [f'{prefix}{path}' for path in values]
    
    

    My final code looks like this:

    Item: No changes

    ItemLoader:

    from urllib.parse import urljoin, urlparse
    
    from itemloaders.processors import TakeFirst, MapCompose
    from scrapy.loader import ItemLoader
    
    
    class ChocolateProductLoader(ItemLoader):
        """Loader for chocolate products that makes scraped hrefs absolute.
    
        The spider passes the page URL as the ``url`` keyword argument when
        it creates the loader; ``url_in`` reads it back from the context.
        """
    
        default_output_processor = TakeFirst()
        # Keep only the text after the last '£' sign.
        price_in = MapCompose(lambda x: x.split('£')[-1])
    
        def url_in(self, values):
            """Resolve each scraped href against the page URL.
    
            ``urljoin`` is used instead of string concatenation so that
            hrefs that are already absolute, or that do not start with
            ``/``, are still resolved correctly. Root-relative hrefs give
            the same result as before. If no ``url`` was supplied in the
            context, the hrefs are returned unchanged.
    
            :param values: iterable of href strings collected for ``url``.
            :return: list of resolved URL strings, in the same order.
            """
            page_url = self.context.get("url")
            return [urljoin(page_url, value) for value in values]
    
    

    Spider:

    import scrapy
    from scrapeops_guide.itemsloaders import ChocolateProductLoader
    from scrapeops_guide.items import ChocolateProduct
    
    
    class ChocolateSpider(scrapy.Spider):
        """Spider that emits one ChocolateProduct per listed product."""
    
        name = 'chocolate_spider'
        allowed_domains = ['chocolate.co.uk']
        start_urls = ['https://www.chocolate.co.uk/collections/all']
    
        def parse(self, response, **kwargs):
            """Yield the page's products, then a request for the next page."""
            for node in response.css('.product-item'):
                loader = ChocolateProductLoader(
                    item=ChocolateProduct(),
                    selector=node,
                    url=response.url,
                )
                loader.add_css('name', 'a.product-item-meta__title::text')
                # Pattern kept verbatim, including the whitespace between
                # the two <span> tags.
                price_pattern = (
                    '<span class="price">\n              '
                    '<span class="visually-hidden">Sale price'
                    '</span>(.*)</span>'
                )
                loader.add_css('price', 'span.price', re=price_pattern)
                loader.add_css('url', 'div.product-item-meta a::attr(href)')
                yield loader.load_item()
    
            following = response.css('a[rel="next"]::attr(href)').get()
            if not following:
                return
            yield response.follow(following, callback=self.parse)