Search code examples
Tags: python-3.x, function, web-scraping, scrapy

How to gather item data from different pages in scrapy


I am collecting data from a web page of quotes and then following each quote's "about" link to the author page to collect more data. I want the data from both pages to be stored in the same row, i.e. in the same Scrapy item.

code :

    def parse(self, response):
        """Yield a follow-up request and an item for every ``.quote`` element.

        For each quote the text, tags and author are written into
        ``self.quote_item``; a request for the linked page is yielded with
        that item passed to ``parse_page`` through ``cb_kwargs``, and then the
        item itself is yielded.
        """
        qs = response.css('.quote')
        # One QuotesItem is created here, before the loop, and stored on the
        # spider; every iteration below writes into this same object.
        self.quote_item = QuotesItem()

        for q in qs:
            # href of the first ``span a`` link inside the quote, prefixed with
            # the site root to form the URL that is followed below.
            page_url = q.css('span a').attrib['href']
            full_page_url = 'https://quotes.toscrape.com' + page_url

            # tags: first text node of every ``.tag`` element of this quote
            t = []
            tags = q.css('.tag')
            for tag in tags:
                t.append(tag.css('::text').get())

            # items
            # NOTE: each assignment ends with a trailing comma, so the value
            # stored is a one-element tuple rather than the bare string/list.

            self.quote_item['quote'] = q.css('.text ::text').get(),
            self.quote_item['tag'] = t,
            self.quote_item['author'] = q.css('span .author ::text').get(),
            yield response.follow(full_page_url, callback=self.parse_page, cb_kwargs={'item': self.quote_item})
            # NOTE(review): the item is yielded from here, not from parse_page,
            # so whatever the callback adds later is presumably not part of the
            # row emitted at this point -- confirm against the exported output.
            yield self.quote_item




    def parse_page(self, response, item):
        """Merge the birth date and location found on the page into ``item``.

        Both values are read from inside the ``.author-details`` element.
        The method only updates ``item`` in place: it neither returns nor
        yields anything.
        """
        q = response.css('.author-details')
        # Collect the two fields in a temporary dict, then copy them over.
        n = {}
        n['birth_date'] = q.css('p .author-born-date ::text').get()
        n['birth_location'] = q.css('p .author-born-location ::text').get()
        item.update(n)

As a result the data collected doesn't include birth_date and birth_location. Output example:

{"quote": ["“One good thing about music, when it hits you, you feel no pain.”"], "tag": [["music"]], "author": ["Bob Marley"]}

Is there a way to add the output of the second function to the item built in the first one without creating a new row? (I tried yielding the output from the second function, but then it shows up as a separate line.)


Solution

1. You need to create a new item on every iteration.
2. Yield the item in the last callback method.
3. Next time, search SO first; this has been answered multiple times already.

Updated code:

    def parse(self, response):
        """Build one item per quote and pass it on to the author-page callback.

        For every ``.quote`` element a new ``QuotesItem`` is filled with the
        quote text, its tags and the author name. The item is not yielded
        here; it travels with the follow-up request (via ``cb_kwargs``) to
        ``parse_page``, which is the callback named in that request.

        Yields:
            One request per quote, targeting the page linked from the quote.
        """
        for quote in response.css('.quote'):
            # A new item per quote, so no two requests share (and overwrite)
            # the same object.
            quote_item = QuotesItem()

            # No trailing commas after the values: ``x = value,`` would store
            # a one-element tuple instead of the value itself.
            quote_item['quote'] = quote.css('.text ::text').get()
            quote_item['tag'] = [
                tag.css('::text').get() for tag in quote.css('.tag')
            ]
            quote_item['author'] = quote.css('span .author ::text').get()

            # response.follow() resolves a relative href against the URL of
            # the current response, so the site root need not be hard-coded.
            author_href = quote.css('span a').attrib['href']

            # dont_filter=True: several quotes can link to the same page.
            # Without it the duplicate filter would drop the repeated request
            # and the item attached to it would never be emitted. The cost is
            # that such a page is downloaded once per quote.
            yield response.follow(
                author_href,
                callback=self.parse_page,
                cb_kwargs={'item': quote_item},
                dont_filter=True,
            )
    
    
    def parse_page(self, response, item):
        """Fill in the birth details on ``item`` and emit it.

        The values are looked up inside the ``.author-details`` element of
        the response; a field whose selector matches nothing is set to the
        ``None`` returned by ``.get()``.

        Yields:
            The same ``item`` object, now carrying ``birth_date`` and
            ``birth_location``.
        """
        details = response.css('.author-details')
        # (item field, selector relative to the details element)
        field_selectors = (
            ('birth_date', 'p .author-born-date ::text'),
            ('birth_location', 'p .author-born-location ::text'),
        )
        for field, selector in field_selectors:
            item[field] = details.css(selector).get()
        yield item