Search code examples
pythonpython-3.xxpathimdb

Why this function runs correctly only once, and then returns an empty list?


This code scraps IMDb website (gets movie's titles, years, ranks, etc) with XPath expressions and returns their result but its selector() function only runs once correctly then breaks. How do I fix it?

#!/usr/bin/env python3
import lxml.html
import requests as rq


IMDB_HTML = "http://www.imdb.com/filmosearch"
IMDB_JSON = "http://www.imdb.com/xml/find"


class IMDBParser(object):
    def __init__(self, role_type=None, sort_type='user_rating, desc',
                 job_type="actor", title_type="movie"):
        self.job_type = job_type
        self.sort_type = sort_type
        self.title_type = title_type
        self.role_type = role_type
        self.params = {
            'page': 0,
            'sort': sort_type,
            'role': role_type,
            'job_type': job_type,
            'title_type': title_type
        }

    def identity(self):
        """gets actor's name and extracts its id from
        imdb website."""
        response = rq.get(IMDB_JSON, params={'json': 1, 'nm': 'one',
                          'q': rq.compat.quote_plus(self.role_type)})
        movie_dicts = response.json()
        return movie_dicts.get('name_popular', 'name_approx')[0]['id']

    def selector(self, expr):
        """gets an expression and extracts all matched then
        returns a generator of each matching value."""
        self.params['role'] = self.identity()
        while True:
            self.params['page'] += 1
            response = rq.get(IMDB_HTML, params=self.params)
            elements = lxml.html.fromstring(response.text).xpath(expr)
            if not elements:
                break
            yield from (element.text for element in elements)


class IMDBApplication(IMDBParser):
    def __init__(self, role_type=None, sort_type='user_rating, desc',
                 job_type="actor", title_type="movie"):
        IMDBParser.__init__(self)
        self.job_type = job_type
        self.sort_type = sort_type
        self.title_type = title_type
        self.role_type = role_type

    def get_titles(self):
        """passes the xpath expression to the function and gets
        its return."""
        expr = "//*/div/div[2]/div[3]/div/div[2]/h3/a[1]"
        return self.selector(expr)

    def get_scores(self):
        """passes the xpath expression to the function and gets
        its return."""
        expr = "//*/div[2]/div[3]/div/div[2]/div/div[1]/strong"
        return self.selector(expr)

    def get_years(self):
        """passes the xpath expression to the function and gets
        its return."""
        expr = "//*/div/div[2]/div[3]/div/div[2]/h3/span[2]"
        return self.selector(expr)

if __name__ == "__main__":
    ia1 = IMDBApplication("Daniel Craig")
    print([i for i in ia1.get_titles()])
    print([i for i in ia1.get_scores()])

Solution

  • The problem is:

    self.params['page'] += 1
    

    One the first request, you increment the page number until you don't get any results. However, you never reset it. If you change your selector function to:

    def selector(self, expr):
        """gets an expression and extracts all matched then
        returns a generator of each matching value."""
        self.params['role'] = self.identity()
        while True:
            self.params['page'] += 1
            response = rq.get(IMDB_HTML, params=self.params)
            elements = lxml.html.fromstring(response.text).xpath(expr)
            if not elements:
                break
            yield from (element.text for element in elements)
        self.params['page'] = 0
    

    It works fine, giving:

    ['Casino Royale', 'The Girl with the Dragon Tattoo', 'One Life', 'Skyfall', 'Road to Perdition', 'Munich', 'Elizabeth', 'Layer Cake', 'The Adventures of Tintin: The Secret of the Unicorn', 'Defiance', 'The Power of One', 'The Jacket', 'Infamous', 'Sorstalanság', 'The Mother', 'Flashbacks of a Fool', 'Renaissance', 'Ten Minutes Older: The Cello', 'Quantum of Solace', 'Some Voices', 'Love Is the Devil: Study for a Portrait of Francis Bacon', 'Hotel Splendide', 'Enduring Love', 'Sylvia', 'The Golden Compass', 'Cowboys & Aliens', 'The Trench', 'Dream House', 'The Invasion', 'Lara Croft: Tomb Raider', 'I Dreamed of Africa', 'Obsession', 'Love & Rage', 'Saint-Ex', "A Kid in King Arthur's Court", 'Spectre', 'The Girl Who Played with Fire', 'Bond 25', "The Girl Who Kicked the Hornets' Nest"]
    ['8.0', '7.9', '7.9', '7.8', '7.7', '7.6', '7.5', '7.4', '7.4', '7.2', '7.2', '7.1', '7.1', '7.1', '6.9', '6.8', '6.8', '6.8', '6.7', '6.7', '6.6', '6.5', '6.4', '6.3', '6.1', '6.1', '6.1', '5.9', '5.9', '5.7', '5.5', '5.3', '5.3', '5.1', '4.7']