Search code examples
python, selenium-webdriver, scrapy

Python scrape variations with multiple combinations for size and colour


I am trying to scrape the size and colour variants of a product.

Here is the scenario:

Selected Colour: -Mantis Green -Spool Yellow

Selected Size: -6lb -8lb -10lb -15lb -20lb -30lb

I need to scrape the title, price, and special price. Here is the code I have so far:

import scrapy
import re
from scrapy_splash import SplashRequest

class FishingRodsSpider(scrapy.Spider):
    """Scrape one Anaconda product page, fetched through Splash.

    The spider extracts the product title and the two displayed prices, and
    collects the ``data-url`` attributes exposed by the colour ("style") and
    size pickers found on the page.
    """

    name = "ana_rods_detailed"
    allowed_domains = ["anacondastores.com"]
    start_urls = ["https://www.anacondastores.com/fishing/fishing-line/braid-line/shimano-kairiki-8-braid-line-150-metre-spool/BP90140299"]

    def start_requests(self):
        """Yield a ``SplashRequest`` (``wait`` argument of 2) for each start URL."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 2})

    def parse(self, response):
        """Extract product fields and log the variant URLs found on the page.

        Yields:
            dict: one item with ``title``, ``price`` (the ``.price-was``
            amount), ``club_price`` (the ``.price-now`` amount) and
            ``style_urls`` (absolute URLs built from the style pickers'
            ``data-url`` attributes). A text field is ``None`` when its
            selector matches nothing on the page.
        """
        base_url = "https://www.anacondastores.com"

        def stripped_text(css_query):
            # .get() returns None when the selector matches nothing; calling
            # .strip() on that would raise AttributeError and abort the page.
            value = response.css(css_query).get()
            return value.strip() if value is not None else None

        title = stripped_text('.pdp-title::text')
        price = stripped_text('.product-info .price-was .amount::text')
        club_price = stripped_text('.product-info .price-now .amount::text')

        # The size pickers do not depend on the style picker being visited,
        # so read their data-url attributes once instead of once per style.
        size_data_urls = [
            size_picker.attrib.get('data-url')
            for size_picker in response.css('.js-variant-size')
        ]

        style_data_urls = []
        for i, style_picker in enumerate(response.css('.js-variant-style-picker')):
            style_data_url = style_picker.attrib.get('data-url')

            if style_data_url is None:
                if i == 0:
                    # First style picker has no URL: only report size
                    # pickers that are missing their data-url as well.
                    for size_data_url in size_data_urls:
                        if size_data_url is None:
                            self.log("Size data-url attribute is missing for a size picker.")
                else:
                    self.log("Style data-url attribute is missing for a style picker.")
                continue

            style_data_urls.append(base_url + style_data_url)
            for size_data_url in size_data_urls:
                if size_data_url is not None:
                    self.log("Size Variant: " + base_url + size_data_url)

        # Emit the scraped values; previously they were computed and dropped,
        # so the spider produced no items at all.
        yield {
            "title": title,
            "price": price,
            "club_price": club_price,
            "style_urls": style_data_urls,
        }

Expected Result: -Mantis Green 6lb -Mantis Green 8lb -Mantis Green 10lb -Mantis Green 20lb -Mantis Green 30lb -Spool Yellow 6lb -Spool Yellow 8lb -Spool Yellow 10lb -Spool Yellow 20lb -Spool Yellow 30lb


Solution

  • The data for this website is available in the loaded page, so there is no need to use scrapy-splash. Check the network activity in your browser's developer tools to find the URL that is requested when you click on a colour or size option. See the sample code below:

    import scrapy
    
    
    class AnacondaSpider(scrapy.Spider):
        """Crawl the colour and size variants of an Anaconda product page.

        ``parse`` follows every colour URL found on the page (handled by
        ``parse`` again) and requests one URL per size code, which is handled
        by ``parse_size``.
        """

        name = "anaconda"
        allowed_domains = ["www.anacondastores.com"]
        start_urls = [
            "https://www.anacondastores.com/fishing/fishing-line/braid-line/shimano-kairiki-8-braid-line-150-metre-spool/BP90140299-mantis-green"
        ]

        def parse(self, response):
            """Yield follow-up requests for each colour, then for each size."""
            # Colour pages go back through the default callback (parse).
            colour_links = response.css(
                ".js-variant-style-picker::attr(data-url)"
            ).getall()
            yield from map(response.follow, colour_links)

            # Each size code is resolved against the current page URL.
            variant_codes = response.css(
                ".size-variant a::attr(data-variant-size-code)"
            ).getall()
            yield from (
                scrapy.Request(
                    response.urljoin(variant_code + "?version=7"),
                    callback=self.parse_size,
                )
                for variant_code in variant_codes
            )

        def parse_size(self, response):
            """Yield one item built from the product wrapper's data attributes."""
            # Item key -> CSS query, in the order the fields are read.
            field_queries = (
                ("title", "#productContentWrapper > div::attr(data-product-name)"),
                ("price", "#productContentWrapper > div::attr(data-product-metric1)"),
                ("sale_price", "#productContentWrapper > div::attr(data-product-price)"),
                ("size", "#productContentWrapper > div::attr(data-product-dimension13)"),
            )

            yield {
                field: response.css(query).get()
                for field, query in field_queries
            }