I have a Scrapy spider as follows:
class CoursesSpider(scrapy.Spider):
    """Spider that walks subject -> course -> section pages.

    Every callback yields plain dict items. In ``parse`` and
    ``parse_course`` the follow-up ``scrapy.Request`` is placed *inside*
    the item (under ``subject_data`` / ``course_data``) instead of being
    yielded by itself, so the exported item contains the Request object
    rather than the data scraped from the linked page.
    """

    name = "courses"
    start_urls = ['http://example.com']

    def parse(self, response):
        """Yield one item per ``subject`` node found on the start page."""
        for node in response.css('subject'):
            title = node.css('subject::text').extract_first().strip()
            href = node.css('subject::attr(href)').extract_first().strip()
            ident = node.css('subject::attr(id)').extract_first().strip()
            if href is None:
                continue
            # The Request is embedded in the item, not yielded on its own.
            follow_up = scrapy.Request(href, callback=self.parse_course)
            yield {
                'subject_name': title,
                'subject_link': href,
                'subject_id': ident,
                'subject_data': follow_up,
            }

    def parse_course(self, response):
        """Yield one item per ``course`` node found on a subject page."""
        page_subject_id = response.css('::attr(id)').extract_first().strip()
        for node in response.css('course'):
            title = node.css('course::text').extract_first().strip()
            href = node.css('course::attr(href)').extract_first().strip()
            ident = node.css('course::attr(id)').extract_first().strip()
            if href is None:
                continue
            # Same pattern as in parse(): the Request lives inside the item.
            follow_up = scrapy.Request(href, callback=self.parse_class)
            yield {
                'course_name': title,
                'course_link': href,
                'course_id': " ".join((page_subject_id, ident)),
                'course_data': follow_up,
            }

    def parse_class(self, response):
        """Yield one item per ``section`` node found on a course page."""
        page_course_id = response.css('::attr(id)').extract_first().strip()
        for node in response.css('section'):
            title = node.css('section::text').extract_first().strip()
            href = node.css('section::attr(href)').extract_first().strip()
            yield {
                'section_name': title,
                'section_link': href,
                'course_id': page_course_id,
            }
I'd like to get an output JSON file that has a tree structure like so:
{"subject_id": "...", "subject_name": "...", "subject_link": "...", "subject_data":
{"course_id": "...", "course_link": "...", "course_name": "...", "course_data":
{"course_id": "...", "section_link": "...", "section_name": "..."}
}
}
However, I only get this:
{"subject_id": "...", "subject_data": "<Request GET http://example.com/something>", "subject_name": "...", "subject_link": "..."}
From my understanding, this is because the nested requests have not been executed yet when the item is yielded. How would I go about calling an equivalent of "scrapy crawl courses -o courses.json" that fully executes all the requests? If that's not possible out of the box, how can I do this myself? Can I later import the JSON in a Python file and somehow run `<Request GET http://example.com/something>` and the following ones?
I know there's a lot of code, but it should clarify. Thanks for your help!
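I see 2 ways of doing this:
1. Pass the data incrementally to each callback using the Request.meta dict. See "Passing additional data to callback functions" in the Scrapy documentation; or
2. Use something like scrapy-inline-requests to fetch the nested pages from within a single callback.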
Method 1.
class CoursesSpider(scrapy.Spider):
    """Spider that carries parent data down the callback chain via ``meta``.

    Each level (subject -> course -> section) attaches what it scraped to
    the next ``Request``'s ``meta`` dict. Only ``parse_class`` yields items:
    one per section, each embedding its course info, which in turn embeds
    its subject info.
    """

    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    def parse(self, response):
        """Schedule one subject-page request per ``subject`` node.

        Subjects without an ``href`` are skipped because there is no page
        to follow for them.
        """
        for subject in response.css('subject'):
            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()
            # check for a missing link *before* calling .strip() on it
            if subject_link is None:
                continue
            subject_link = subject_link.strip()
            # build a dict with the info we have so far
            subject_info = {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
            }
            # ask Scrapy to fetch additional data, passing what we know
            # along in the Request's meta dict; urljoin() makes relative
            # hrefs absolute (absolute ones are returned unchanged)
            yield scrapy.Request(
                response.urljoin(subject_link),
                callback=self.parse_course,
                meta={'subject_info': subject_info},
            )

    def parse_course(self, response):
        """Schedule one course-page request per ``course`` node.

        The subject info received through ``meta`` is nested into each
        course's info dict, which is then forwarded to ``parse_class``.
        """
        # get back the data that was passed previously
        subject_info = response.meta['subject_info']
        subject_id = response.css('::attr(id)').extract_first().strip()
        for course in response.css('course'):
            course_name = course.css('course::text').extract_first().strip()
            course_link = course.css('course::attr(href)').extract_first()
            course_id = course.css('course::attr(id)').extract_first().strip()
            if course_link is None:
                continue
            course_link = course_link.strip()
            # build a dict with the data in this page
            # + the data scraped previously
            course_info = {
                'course_name': course_name,
                'course_link': course_link,
                'course_id': subject_id + " " + course_id,
                'subject_info': subject_info,
            }
            # pass that data (course_info, not just subject_info) to the
            # next callback and fetch the class page
            yield scrapy.Request(
                response.urljoin(course_link),
                callback=self.parse_class,
                meta={'course_info': course_info},
            )

    def parse_class(self, response):
        """Yield one item per ``section`` node, with its course info attached."""
        # get course data from previous callbacks
        course_info = response.meta['course_info']
        course_id = response.css('::attr(id)').extract_first().strip()
        for section in response.css('section'):
            section_name = section.css('section::text').extract_first().strip()
            section_link = section.css('section::attr(href)').extract_first().strip()
            yield {
                'section_name': section_name,
                'section_link': section_link,
                'course_id': course_id,
                'course_info': course_info
            }
So you will not get subjects containing courses, themselves containing sections. Rather, you will get sections, each carrying info on the course it belongs to, which in turn carries info on the subject it relates to.
Method 2. (Warning: I have not tested this in practice but it may work)
from inline_requests import inline_requests
class CoursesSpider(scrapy.Spider):
    """Spider that builds the whole subject -> course -> section tree.

    Uses the ``inline_requests`` decorator so that nested pages can be
    fetched from inside a single callback: ``yield``-ing a ``Request``
    without a callback hands back the corresponding ``Response``. One item
    is yielded per subject, with its courses (and their sections) nested
    inside.
    """

    name = "courses"
    start_urls = [
        'http://example.com'
    ]

    # this decorator is important
    @inline_requests
    def parse(self, response):
        """Yield one nested item per ``subject`` node on the start page.

        Subjects or courses without an ``href`` are still emitted, with an
        empty ``subject_data`` / ``course_data`` list. Ids read from the
        listing elements and ids read from the fetched pages are kept in
        separate variables so one does not silently overwrite the other.
        """
        for subject in response.css('subject'):
            subject_name = subject.css('subject::text').extract_first().strip()
            subject_link = subject.css('subject::attr(href)').extract_first()
            subject_id = subject.css('subject::attr(id)').extract_first().strip()
            # check for a missing link *before* calling .strip() on it
            if subject_link is not None:
                subject_link = subject_link.strip()

            # this list will collect information on courses for this subject
            subject_data = []
            if subject_link is not None:
                # you ask scrapy to fetch the page
                # but you do not set a callback
                subject_response = yield scrapy.Request(
                    response.urljoin(subject_link))
                # and you get a Response to work on when it's fetched,
                # without going through a callback
                page_subject_id = subject_response.css(
                    '::attr(id)').extract_first().strip()

                for course in subject_response.css('course'):
                    course_name = course.css('course::text').extract_first().strip()
                    course_link = course.css('course::attr(href)').extract_first()
                    course_id = course.css('course::attr(id)').extract_first().strip()
                    if course_link is not None:
                        course_link = course_link.strip()

                    # this list will collect information on sections for this course
                    course_data = []
                    if course_link is not None:
                        # same thing here, you ask Scrapy to fetch a Response
                        course_response = yield scrapy.Request(
                            subject_response.urljoin(course_link))
                        page_course_id = course_response.css(
                            '::attr(id)').extract_first().strip()

                        for section in course_response.css('section'):
                            section_name = section.css('section::text').extract_first().strip()
                            section_link = section.css('section::attr(href)').extract_first().strip()
                            # add each section item
                            course_data.append(
                                {
                                    'section_name': section_name,
                                    'section_link': section_link,
                                    'course_id': page_course_id,
                                }
                            )

                    # add each course item
                    subject_data.append(
                        {
                            'course_name': course_name,
                            'course_link': course_link,
                            'course_id': page_subject_id + " " + course_id,
                            'course_data': course_data,
                        }
                    )

            yield {
                'subject_name': subject_name,
                'subject_link': subject_link,
                'subject_id': subject_id,
                'subject_data': subject_data,
            }