Search code examples
pythonjsonyieldscrapy

Scrapy output file that recursively runs all the yield requests - how to


So I have a Scrapy spider as follows:

# Spider from the question: walks subjects -> courses -> sections, with one
# callback per level.
# NOTE(review): the class body lost its indentation in this paste; the
# attributes and methods below presumably belong to CoursesSpider.
class CoursesSpider(scrapy.Spider):
name = "courses"
start_urls = [
    'http://example.com'
]

# Callback for the start URL: yields one dict item per subject entry.
def parse(self, response):
    for subject in response.css('subject'):

        subject_name = subject.css('subject::text').extract_first().strip()
        subject_link = subject.css('subject::attr(href)').extract_first().strip()
        subject_id = subject.css('subject::attr(id)').extract_first().strip()

        # NOTE(review): .strip() has already been called on the value above,
        # so a missing href would presumably fail there before this check
        # could ever see None.
        if subject_link is not None:
            # The Request is only built here and stored in the item below;
            # it is never yielded on its own.
            subject_data = scrapy.Request(subject_link, callback=self.parse_course)

        # The item embeds the Request object itself, which is what shows up
        # as "<Request GET ...>" in the exported JSON.
        yield {
            'subject_name': subject_name,
            'subject_link': subject_link,
            'subject_id': subject_id,
            'subject_data': subject_data,
        }


# Callback attached to the subject Requests built in parse(): yields one dict
# item per course entry on the page.
def parse_course(self, response):

    # id read from the fetched page itself, used as a prefix for course ids
    subject_id = response.css('::attr(id)').extract_first().strip()

    for course in response.css('course'):

        course_name = course.css('course::text').extract_first().strip()
        course_link = course.css('course::attr(href)').extract_first().strip()
        course_id = course.css('course::attr(id)').extract_first().strip()

        if course_link is not None:
            # Same pattern as in parse(): the Request is stored in the item,
            # not yielded.
            course_data = scrapy.Request(course_link, callback=self.parse_class)

        yield {
            'course_name': course_name,
            'course_link': course_link,
            'course_id': subject_id + " " + course_id,
            'course_data': course_data,
        }

# Callback attached to the course Requests built in parse_course(): yields one
# dict item per section entry on the page.
def parse_class(self, response):

    # id read from the fetched page itself
    course_id = response.css('::attr(id)').extract_first().strip()

    for section in response.css('section'):
        section_name = section.css('section::text').extract_first().strip()
        section_link = section.css('section::attr(href)').extract_first().strip()

        yield {
            'section_name': section_name,
            'section_link': section_link,
            'course_id': course_id,
        }

I'd like to get an output json file that has a tree structure like so:

{"subject_id": "...", "subject_name": "...", "subject_link": "...", "subject_data": 
  {"course_id": "...", "course_link": "...", "course_name": "...", "course_data": 
    {"course_id": "...", "section_link": "...", "section_name": "..."}
  }
}

However i only get this:

{"subject_id": "...", "subject_data": "<Request GET http://example.com/something>", "subject_name": "...", "subject_link": "..."}

From my understanding, this is because the code behind the yielded requests hasn't been executed yet. How would I go about calling an equivalent of "scrapy crawl courses -o courses.json" that fully follows all the requests? If that's not possible out of the box, how can I do this myself? Can I later import the JSON in a Python file and somehow run <Request GET http://example.com/something> and the following ones?

I know there's a lot of code, but it should clarify. Thanks for your help!


Solution

  • I see 2 ways of doing this:

    1. either build the data incrementally, and pass the data to each callback using Request.meta dict. See Passing additional data to callback functions

    or

    2. use something like scrapy-inline-requests (to be tested)

    Method 1.

    class CoursesSpider(scrapy.Spider):
        """Crawl subjects -> courses -> sections, carrying data forward.

        Each callback packs what it has scraped into the ``meta`` dict of the
        Request it yields, and the next callback reads it back. Items are only
        emitted by ``parse_class``: one per section, each nesting the info of
        its course, which in turn nests the info of its subject.
        """

        name = "courses"
        start_urls = [
            'http://example.com'
        ]

        def parse(self, response):
            """Yield one Request per subject, with the subject info in its meta."""
            for subject in response.css('subject'):

                subject_name = subject.css('subject::text').extract_first().strip()
                subject_id = subject.css('subject::attr(id)').extract_first().strip()

                # extract_first() returns None when nothing matches, so test
                # for None *before* calling .strip() on the link
                subject_link = subject.css('subject::attr(href)').extract_first()
                if subject_link is None:
                    # no subject page to follow, hence no sections to emit
                    continue
                subject_link = subject_link.strip()

                # build a dict with the info we have so far
                subject_info = {
                    'subject_name': subject_name,
                    'subject_link': subject_link,
                    'subject_id': subject_id,
                }

                # ask Scrapy to fetch additional data, attaching what we
                # already know to the Request's meta dict
                yield scrapy.Request(
                    subject_link,
                    callback=self.parse_course,
                    meta={'subject_info': subject_info},
                )

        def parse_course(self, response):
            """Yield one Request per course, with course + subject info in its meta."""

            # get back the data that was passed previously
            subject_info = response.meta['subject_info']

            subject_id = response.css('::attr(id)').extract_first().strip()

            for course in response.css('course'):

                course_name = course.css('course::text').extract_first().strip()
                course_id = course.css('course::attr(id)').extract_first().strip()

                # same None-before-strip guard as in parse()
                course_link = course.css('course::attr(href)').extract_first()
                if course_link is None:
                    continue
                course_link = course_link.strip()

                # build a dict with the data in this page
                # + the data scraped previously
                course_info = {
                    'course_name': course_name,
                    'course_link': course_link,
                    'course_id': subject_id + " " + course_id,
                    'subject_info': subject_info,
                }

                # fetch the class page, passing the *course* data (which
                # already nests subject_info) to the next callback
                yield scrapy.Request(
                    course_link,
                    callback=self.parse_class,
                    meta={'course_info': course_info},
                )

        def parse_class(self, response):
            """Yield one item per section, nesting the course info from meta."""

            # get course data from previous callbacks
            course_info = response.meta['course_info']

            course_id = response.css('::attr(id)').extract_first().strip()

            for section in response.css('section'):
                section_name = section.css('section::text').extract_first().strip()
                section_link = section.css('section::attr(href)').extract_first().strip()

                yield {
                    'section_name': section_name,
                    'section_link': section_link,
                    'course_id': course_id,
                    'course_info': course_info
                }
    

    Note that the result is inverted compared to what you asked for: you will not get subjects containing courses, themselves containing sections. Instead, you get one item per section, each carrying info on the course it belongs to, which in turn carries info on the subject it relates to.

    Method 2. (Warning: I have not tested this in practice but it may work)

    from inline_requests import inline_requests
    
    class CoursesSpider(scrapy.Spider):
        """Build one nested item per subject using scrapy-inline-requests.

        The subject and course pages are fetched inline from ``parse`` (no
        separate callbacks), so each yielded subject item directly contains
        its list of courses, and each course its list of sections.
        """

        name = "courses"
        start_urls = [
            'http://example.com'
        ]

        # this decorator is important
        @inline_requests
        def parse(self, response):
            """Yield one dict per subject, with courses and sections nested."""

            for subject in response.css('subject'):

                subject_name = subject.css('subject::text').extract_first().strip()
                subject_id = subject.css('subject::attr(id)').extract_first().strip()
                # extract_first() returns None when nothing matches, so the
                # link is stripped only after the None check below
                subject_link = subject.css('subject::attr(href)').extract_first()

                # this list will collect information on courses for this subject
                subject_data = []

                if subject_link is not None:
                    subject_link = subject_link.strip()

                    # you ask scrapy to fetch the page
                    # but you do not set a callback
                    subject_response = yield scrapy.Request(subject_link)
                    # and you get a Response to work on when it's fetched,
                    # without going through a callback

                    # kept under its own name so the id scraped from the
                    # listing (subject_id) is not overwritten
                    subject_page_id = subject_response.css('::attr(id)').extract_first().strip()

                    for course in subject_response.css('course'):

                        course_name = course.css('course::text').extract_first().strip()
                        course_id = course.css('course::attr(id)').extract_first().strip()
                        course_link = course.css('course::attr(href)').extract_first()

                        # this list will collect information on sections for this course
                        course_data = []

                        if course_link is not None:
                            course_link = course_link.strip()

                            # same thing here, you ask Scrapy to fetch a Response
                            course_response = yield scrapy.Request(course_link)

                            # again a separate name: course_id from the
                            # listing is still needed for the course item
                            course_page_id = course_response.css('::attr(id)').extract_first().strip()

                            for section in course_response.css('section'):
                                section_name = section.css('section::text').extract_first().strip()
                                section_link = section.css('section::attr(href)').extract_first().strip()

                                # add each section item
                                course_data.append(
                                    {
                                        'section_name': section_name,
                                        'section_link': section_link,
                                        'course_id': course_page_id,
                                    }
                                )

                        # add each course item
                        subject_data.append(
                            {
                                'course_name': course_name,
                                'course_link': course_link,
                                'course_id': subject_page_id + " " + course_id,
                                'course_data': course_data,
                            }
                        )

                yield {
                    'subject_name': subject_name,
                    'subject_link': subject_link,
                    'subject_id': subject_id,
                    'subject_data': subject_data,
                }