Tags: python, scrapy

Scrapy - Trying to iterate into product page


I am trying to iterate from an all-products page > category > series > product page. I am getting an error in the log showing that I am not retrieving the expected id, but I think it has to do with how I am iterating into the pages; my suspicion is that I am not traveling all the way to the product page.

Start Request

    def start_requests(self):
        urls = [
            'https://www.moxa.com/en/products',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

Initial Parse into Products Page

    def parse(self, response):
        # iterate through each of the relative category urls
        for explore_products in response.css('li.alphabet-list--no-margin a.alphabet-list__link::attr(href)').getall():
            category_url = response.urljoin(explore_products)
            logging.info("Category_links: " + category_url)
            yield scrapy.Request(category_url, callback=self.parse_categories)

2nd Parse for Series

    def parse_categories(self, response):
        for series_href in response.css('a.series-card__wrapper::attr(href)').getall():
            series_url = response.urljoin(series_href)
            logging.info("Series_links: " + series_url)
            yield scrapy.Request(series_url, callback=self.parse_series)

3rd Parse to reach the product page itself (I think this is where it is breaking). If possible, I would like to check whether the "target_id" appears in each candidate URL, so that only passing results are added to the product_links list. Example: with target_id TN-5916-WV-T, the URL https://www.moxa.com/Products/INDUSTRIAL-NETWORK-INFRASTRUCTURE/Secure-Routers/EN-50155-Routers/TN-5900-Series/TN-5916-WV-T should pass as true and be added to the list, but https://www.moxa.com/en/products/quotation-list should not pass and should not be added (a standalone illustration of this check follows the code below).

    def parse_series(self, response):
        for product_href in response.css('.model-table a::attr(href)').getall():
            target_list = response.xpath('//table[@class="model-table"]//a/@href').getall()
            target_id = response.css('table.model-table th::attr(data-id)').get()
            target_path = [p for p in target_list if target_id in p]  # computed but never used below
            product_url = response.urljoin(product_href)
            self.logger.info("target_id: " + target_id)
            self.logger.info("product_url: " + product_url)
            logging.info("Product_links: " + product_url)
            yield scrapy.Request(product_url, callback=self.parse_new_item)
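
To illustrate the check I am after in isolation (just a sketch using the two example URLs above):

    target_id = 'TN-5916-WV-T'
    candidate_urls = [
        'https://www.moxa.com/Products/INDUSTRIAL-NETWORK-INFRASTRUCTURE/Secure-Routers/EN-50155-Routers/TN-5900-Series/TN-5916-WV-T',
        'https://www.moxa.com/en/products/quotation-list',
    ]
    product_links = [url for url in candidate_urls if target_id in url]
    # product_links keeps only the first URL; the quotation-list link is filtered out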

Return the expected item results

    def parse_new_item(self, response):
        for product in response.css('section.main-section'):
            items = MoxaItem() # Unique item for each iteration
            items['product_link'] = response.url # get the product link from response
            name_dirty = product.css('h5.series-card__heading.series-card__heading--big::text').get()
            logging.info("name_dirty: " + name_dirty)  # this is the line that raises the TypeError in the log below
            product_sku = name_dirty.strip()
            product_store_description = product.css('p.series-card__intro').get() 
            product_sub_title = product_sku + ' ' + product_store_description
            summary = product.css('section.features h3 + ul').getall()
            datasheet = product.css('li.side-section__item a::attr(href)').getall()
            description = product.css('.products .product-overview::text').getall()
            specification = product.css('div.series-card__table').getall()
            products_zoom_image = name_dirty.strip() + '.jpg'
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            # weight = product.xpath('//div[@class="series-card__table"]//p[@class="title-list__heading"]/text()[contains(., "Weight")]/following-sibling::div//text()').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            
            # assign plain values; a trailing comma would turn each value into a 1-tuple
            items['product_sku'] = product_sku
            items['product_sub_title'] = product_sub_title
            items['summary'] = summary
            items['description'] = description
            items['specification'] = specification
            items['products_zoom_image'] = products_zoom_image
            items['main_image'] = main_image
            # items['weight'] = weight
            # items['rel_links'] = rel_links
            items['datasheet'] = datasheet
            yield items

My log, where the error appears

 File "/home/joel/Desktop/moxa/moxa/spiders/product_series.py", line 57, in parse_new_item
    logging.info("name_dirty: " + name_dirty)
TypeError: can only concatenate str (not "NoneType") to str
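
This TypeError means name_dirty is None: the h5 heading selector matched nothing on that response, which is what happens when a non-product URL (such as the quotation-list page) slips through to parse_new_item. A minimal guard for that case, just as a sketch:

        for product in response.css('section.main-section'):
            name_dirty = product.css('h5.series-card__heading.series-card__heading--big::text').get()
            if name_dirty is None:
                self.logger.warning("no product heading on %s, skipping", response.url)
                continue  # skip this section rather than crash
            product_sku = name_dirty.strip()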

Solution

  • Try doing it this way... I think this is a better way to make sure you are grabbing the correct links, and it also eliminates the many duplicates that were being generated before.

    def parse_series(self, response):
        for column in response.xpath("//table[@class='model-table']//th"):  # iterate the table columns
            data_id = column.xpath("./@data-id")  # grab the column's data-id attribute
            if not data_id:   # the first column holds the row labels
                continue      # and has no data-id, so skip it
            data_id = data_id.get()  # extract the data-id text
            for link in column.xpath("//a/@href").getall():  # "//a" is absolute, so this walks every
                if data_id not in link:                      # link on the page; the data_id membership
                    continue                                 # test keeps only this column's model links
                url = response.urljoin(link)                 # merge with the domain
                yield scrapy.Request(url, callback=self.parse_new_item)  # request the product page
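
  • You can sanity-check this matching logic outside the spider with Scrapy's Selector. The HTML fragment below is invented for illustration (the real model table on moxa.com is much larger), but the class and attribute names mirror the question:

    from scrapy.selector import Selector

    html = """
    <table class="model-table">
      <tr><th>Features</th><th data-id="TN-5916-WV-T">TN-5916-WV-T</th></tr>
      <tr><td><a href="/en/products/quotation-list">Quote</a></td>
          <td><a href="/Products/INDUSTRIAL-NETWORK-INFRASTRUCTURE/Secure-Routers/EN-50155-Routers/TN-5900-Series/TN-5916-WV-T">Details</a></td></tr>
    </table>
    """
    sel = Selector(text=html)
    for column in sel.xpath("//table[@class='model-table']//th"):
        data_id = column.xpath("./@data-id").get()
        if not data_id:
            continue
        # "//a/@href" is absolute, so it scans the whole document; the data_id
        # membership test is what narrows the result to this column's links
        links = [l for l in column.xpath("//a/@href").getall() if data_id in l]
        print(data_id, links)  # only the href containing the data-id survives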