I am trying to scrape quotes from a web page and then follow each quote's "about" link to the author's page to collect more data. I want the data from both pages to be stored in the same row, i.e. in the same Scrapy item.
Code:
# Callback for a quotes listing page: fills an item per quote and requests the
# quote's author page.
# NOTE(review): the indentation of this snippet was lost, so the nesting of the
# statements (in particular whether the final `yield` sits inside the loop)
# cannot be read off the text.
def parse(self, response):
# Every quote block on the listing page.
qs = response.css('.quote')
# A single item object is created here, before the loop, and stored on the
# spider, so every iteration below writes into this one shared object.
self.quote_item = QuotesItem()
for q in qs:
# Link to the author's "about" page, turned into an absolute URL by hand.
page_url = q.css('span a').attrib['href']
full_page_url = 'https://quotes.toscrape.com' + page_url
# Collect the text of every tag attached to this quote.
t = []
tags = q.css('.tag')
for tag in tags:
t.append(tag.css('::text').get())
# Fill the item. The trailing comma on each of the next three lines makes the
# stored value a one-element tuple, which is why every value in the exported
# output appears wrapped in an extra list.
self.quote_item['quote'] = q.css('.text ::text').get(),
self.quote_item['tag'] = t,
self.quote_item['author'] = q.css('span .author ::text').get(),
# Request the author page, handing the shared item to parse_page via cb_kwargs.
yield response.follow(full_page_url, callback=self.parse_page, cb_kwargs={'item': self.quote_item})
# The item is emitted here, from parse itself, rather than from the callback
# that adds the birth fields. Presumably this happens before the author page
# has been downloaded, which would explain why those fields are missing from
# the output.
yield self.quote_item
# Callback for an author page: reads the birth details and merges them into
# the item received through cb_kwargs.
def parse_page(self, response, item):
# Container holding the author's details.
q = response.css('.author-details')
n = {}
n['birth_date'] = q.css('p .author-born-date ::text').get()
n['birth_location'] = q.css('p .author-born-location ::text').get()
# The item is updated in place but never yielded or returned from this
# callback, so nothing is emitted from here.
item.update(n)
As a result, the collected data doesn't include `birth_date` and `birth_location`.
Output example:
{"quote": ["“One good thing about music, when it hits you, you feel no pain.”"], "tag": [["music"]], "author": ["Bob Marley"]}
Is there a way to add the output of the second function to the item built in the first one without creating a new row? (I tried yielding the output from the second function, but then it shows up as a separate line.)
def parse(self, response):
    """Start one item per quote and pass it on to the author page.

    No item is yielded from this callback. Each partially filled item
    travels with the author-page request through ``cb_kwargs`` and is
    expected to be completed and yielded by ``parse_page``, so every quote
    ends up as exactly one output row.

    Args:
        response: The downloaded quotes listing page.

    Yields:
        One request per quote, targeting that quote's author page.
    """
    for q in response.css('.quote'):
        # Relative link to the author's page; response.follow resolves it
        # against response.url, so the site root need not be hard-coded.
        page_url = q.css('span a').attrib['href']

        # A fresh item per quote, so concurrent callbacks never share state.
        quote_item = QuotesItem()
        # No trailing commas here: `x = value,` would store a 1-tuple and
        # the exported value would be wrapped in an extra list.
        quote_item['quote'] = q.css('.text ::text').get()
        quote_item['tag'] = [tag.css('::text').get() for tag in q.css('.tag')]
        quote_item['author'] = q.css('span .author ::text').get()

        # dont_filter=True: several quotes can share an author, and the
        # duplicate filter would otherwise drop the repeated author-page
        # requests, silently losing those quotes. The cost is that an
        # author page may be downloaded more than once.
        yield response.follow(
            page_url,
            callback=self.parse_page,
            cb_kwargs={'item': quote_item},
            dont_filter=True,
        )
def parse_page(self, response, item):
    """Add the author's birth details to ``item`` and emit it.

    Args:
        response: The downloaded author page.
        item: The partially filled item passed in through ``cb_kwargs``.

    Yields:
        The same ``item``, now carrying ``birth_date`` and
        ``birth_location``.
    """
    details = response.css('.author-details')
    # Item field -> CSS selector, evaluated relative to the details block.
    born_fields = (
        ('birth_date', 'p .author-born-date ::text'),
        ('birth_location', 'p .author-born-location ::text'),
    )
    for field, selector in born_fields:
        item[field] = details.css(selector).get()
    yield item