I'm learning Scrapy and, while following a tutorial, ran into a question I can't find an answer to.
I have this Item:
import scrapy
class ChocolateProduct(scrapy.Item):
    """Item holding the name, price and URL scraped for one product."""

    name = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()
This ItemLoader:
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader that cleans the price and prefixes product URLs."""

    # Applied to every field that has no output processor of its own.
    default_output_processor = TakeFirst()
    # Keep only the text that follows the last '£' sign.
    price_in = MapCompose(lambda x: x.split('£')[-1])
    # Prefix each extracted href with the hard-coded site root.
    url_in = MapCompose(lambda x: 'https://www.chocolate.co.uk' + x)
And this spider:
import scrapy
from urllib.parse import urlparse
from scrapeops_guide.itemsloaders import ChocolateProductLoader
from scrapeops_guide.items import ChocolateProduct
class ChocolateSpider(scrapy.Spider):
    """Spider that yields one ChocolateProduct per listed product."""

    name = 'chocolate_spider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response, **kwargs):
        """Yield an item for each product, then follow the next page.

        Each product selector is fed through ChocolateProductLoader; when
        the page has an ``a[rel="next"]`` link, it is followed with this
        same method as the callback.
        """
        products = response.css('product-item')
        # Site root (scheme + host) of the current page. It is computed
        # here but not yet handed to the loader.
        url = urlparse(response.url)
        url = f'{url.scheme}://{url.netloc}'
        for product in products:
            chocolate = ChocolateProductLoader(
                item=ChocolateProduct(),
                selector=product
            )
            chocolate.add_css('name', 'a.product-item-meta__title::text')
            # The regex captures the text that follows the visually hidden
            # "Sale price" label inside the price span.
            chocolate.add_css('price', 'span.price',
                              re='<span class="price">\n '
                                 '<span class="visually-hidden">Sale price'
                                 '</span>(.*)</span>')
            chocolate.add_css('url', 'div.product-item-meta a::attr(href)')
            yield chocolate.load_item()
        next_page = response.css('a[rel="next"]::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
What I'm trying to do is avoid hard-coding the base URL inside the ItemLoader and make it dynamic instead. For example, I want to replace the fixed string in the ItemLoader (url_in) with the value computed in the spider (url).
I already tried passing a parameter to the ChocolateProductLoader that is available in the context attribute of the Loader but I'm unable to access it inside the Loader. I'm open to other ways to achieve the same solution.
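I found a solution. I realized that if I define a method on the ItemLoader with the same name as the processor attribute I want to replace (here url_in), it is executed in its place and can read the loader's context. So I passed the url value as a keyword argument when creating my Loader instance: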
# The extra ``url`` keyword is read back inside the loader through
# ``self.context.get("url")``.
chocolate = ChocolateProductLoader(
item=ChocolateProduct(),
selector=product,
url=f'{url.scheme}://{url.netloc}'
)
and the ItemLoader code I changed to this:
from urllib.parse import urlparse
from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader whose URL prefix is taken from the loader context."""

    default_output_processor = TakeFirst()
    # Keep only the text that follows the last '£' sign.
    price_in = MapCompose(lambda x: x.split('£')[-1])

    def url_in(self, values):
        """Prefix every extracted href with the ``url`` context value.

        Args:
            values: the hrefs collected for the ``url`` field.

        Returns:
            A list with one prefixed URL per input value.
        """
        return [f'{self.context.get("url")}{value}' for value in values]
My final code looks like this:
Item: No changes
ItemLoader:
from urllib.parse import urljoin, urlparse

from itemloaders.processors import TakeFirst, MapCompose
from scrapy.loader import ItemLoader
class ChocolateProductLoader(ItemLoader):
    """Item loader that resolves product hrefs against the page URL.

    The page URL is expected in the loader context under the ``url`` key,
    i.e. passed as ``url=...`` when the loader is instantiated.
    """

    default_output_processor = TakeFirst()
    # Keep only the text that follows the last '£' sign.
    price_in = MapCompose(lambda x: x.split('£')[-1])

    def url_in(self, values):
        """Turn every extracted href into an absolute URL.

        ``urljoin`` is used instead of gluing ``scheme://netloc`` onto the
        href: root-relative hrefs give the same result as before, while
        hrefs that are already absolute or lack a leading slash are
        resolved correctly rather than producing a malformed URL.

        Args:
            values: the hrefs collected for the ``url`` field.

        Returns:
            A list with one resolved URL per input value. When no ``url``
            is present in the context, the hrefs are returned unchanged.
        """
        page_url = self.context.get("url")
        return [urljoin(page_url, value) for value in values]
Spider:
import scrapy
from scrapeops_guide.itemsloaders import ChocolateProductLoader
from scrapeops_guide.items import ChocolateProduct
class ChocolateSpider(scrapy.Spider):
    """Spider that yields one ChocolateProduct per listed product."""

    name = 'chocolate_spider'
    allowed_domains = ['chocolate.co.uk']
    start_urls = ['https://www.chocolate.co.uk/collections/all']

    def parse(self, response, **kwargs):
        """Yield an item for each product, then follow the next page.

        The URL of the current response is handed to the loader as the
        ``url`` keyword so the loader can build full product URLs from it.
        When the page has an ``a[rel="next"]`` link, it is followed with
        this same method as the callback.
        """
        products = response.css('.product-item')
        for product in products:
            chocolate = ChocolateProductLoader(
                item=ChocolateProduct(),
                selector=product,
                url=response.url
            )
            chocolate.add_css('name', 'a.product-item-meta__title::text')
            # The regex captures the text that follows the visually hidden
            # "Sale price" label inside the price span.
            chocolate.add_css('price', 'span.price',
                              re='<span class="price">\n '
                                 '<span class="visually-hidden">Sale price'
                                 '</span>(.*)</span>')
            chocolate.add_css('url', 'div.product-item-meta a::attr(href)')
            yield chocolate.load_item()
        next_page = response.css('a[rel="next"]::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)