This code scrapes the IMDb website (getting movie titles, years, ranks, etc.) with XPath
expressions and returns the results, but its selector()
function only runs correctly once and then breaks. How do I fix it?
#!/usr/bin/env python3
import lxml.html
import requests as rq
class IMDBParser:
    """Scrape an IMDb listing for one person using XPath expressions.

    The names ``IMDB_JSON`` and ``IMDB_HTML`` used by the methods below are
    not defined in this class; they must exist at module level.
    """

    def __init__(self, role_type=None, sort_type='user_rating, desc',
                 job_type="actor", title_type="movie"):
        """Store the search options and build the request parameters.

        role_type is the person's name; it is sent as the search query by
        identity() and replaced in ``self.params['role']`` by the looked-up
        id when selector() runs. The remaining arguments are passed through
        unchanged as query parameters.
        """
        self.job_type = job_type
        self.sort_type = sort_type
        self.title_type = title_type
        self.role_type = role_type
        self.params = {
            'page': 0,
            'sort': sort_type,
            'role': role_type,
            'job_type': job_type,
            'title_type': title_type,
        }

    def identity(self):
        """Look up ``self.role_type`` and return the id of the first match.

        Raises LookupError when the response contains neither a
        'name_popular' nor a 'name_approx' list with at least one entry.
        """
        # NOTE(review): requests URL-encodes params itself, so quote_plus
        # here probably double-encodes the name -- confirm against the
        # endpoint before removing it.
        response = rq.get(IMDB_JSON, params={'json': 1, 'nm': 'one',
                                             'q': rq.compat.quote_plus(self.role_type)})
        movie_dicts = response.json()
        # Prefer the 'name_popular' list and fall back to 'name_approx'.
        # (Passing 'name_approx' as the dict.get default would return the
        # string itself, not the value stored under that key.)
        matches = movie_dicts.get('name_popular') or movie_dicts.get('name_approx')
        if not matches:
            raise LookupError("no IMDb match for %r" % (self.role_type,))
        return matches[0]['id']

    def selector(self, expr):
        """Yield the text of every element matching ``expr``, page by page.

        Pages are requested one after another, starting from the page
        after ``self.params['page']``, until a page has no matches.
        """
        self.params['role'] = self.identity()
        while True:
            # NOTE: this counter lives on the instance and is not reset
            # here, so a second call resumes from where the first stopped.
            self.params['page'] += 1
            response = rq.get(IMDB_HTML, params=self.params)
            elements = lxml.html.fromstring(response.text).xpath(expr)
            if not elements:
                break
            yield from (element.text for element in elements)
class IMDBApplication(IMDBParser):
    """Convenience queries built on IMDBParser.selector()."""

    def __init__(self, role_type=None, sort_type='user_rating, desc',
                 job_type="actor", title_type="movie"):
        """Accept the same options as IMDBParser and forward them to it.

        The parent initializer must run: it creates ``self.params``, which
        selector() reads and updates.
        """
        super().__init__(role_type=role_type, sort_type=sort_type,
                         job_type=job_type, title_type=title_type)

    def get_titles(self):
        """Return a generator over the text of the title links."""
        expr = "//*/div/div[2]/div[3]/div/div[2]/h3/a[1]"
        return self.selector(expr)

    def get_scores(self):
        """Return a generator over the text of the score elements."""
        expr = "//*/div[2]/div[3]/div/div[2]/div/div[1]/strong"
        return self.selector(expr)

    def get_years(self):
        """Return a generator over the text of the year spans."""
        expr = "//*/div/div[2]/div[3]/div/div[2]/h3/span[2]"
        return self.selector(expr)
if __name__ == "__main__":
    # Print every title for one actor, then every score.
    app = IMDBApplication("Daniel Craig")
    print(list(app.get_titles()))
    print(list(app.get_scores()))
The problem is:
self.params['page'] += 1
On the first call, you increment the page number until you don't get any results. However, you never reset it, so the next call starts past the last page and finds nothing. If you change your selector
function to:
def selector(self, expr):
    """Yield the text of every element matching ``expr``, page by page.

    Pages are requested from page 1 upward until one has no matches.
    The page counter in ``self.params`` is reset to 0 afterwards, so
    every call starts again from the first page.

    Two generators from the same instance must not be interleaved:
    they share ``self.params['page']``.
    """
    self.params['role'] = self.identity()
    try:
        while True:
            self.params['page'] += 1
            response = rq.get(IMDB_HTML, params=self.params)
            elements = lxml.html.fromstring(response.text).xpath(expr)
            if not elements:
                break
            yield from (element.text for element in elements)
    finally:
        # Runs on normal exhaustion and also when the generator is closed
        # early, so an abandoned iteration cannot leave a stale page number.
        self.params['page'] = 0
It works fine, giving:
['Casino Royale', 'The Girl with the Dragon Tattoo', 'One Life', 'Skyfall', 'Road to Perdition', 'Munich', 'Elizabeth', 'Layer Cake', 'The Adventures of Tintin: The Secret of the Unicorn', 'Defiance', 'The Power of One', 'The Jacket', 'Infamous', 'Sorstalanság', 'The Mother', 'Flashbacks of a Fool', 'Renaissance', 'Ten Minutes Older: The Cello', 'Quantum of Solace', 'Some Voices', 'Love Is the Devil: Study for a Portrait of Francis Bacon', 'Hotel Splendide', 'Enduring Love', 'Sylvia', 'The Golden Compass', 'Cowboys & Aliens', 'The Trench', 'Dream House', 'The Invasion', 'Lara Croft: Tomb Raider', 'I Dreamed of Africa', 'Obsession', 'Love & Rage', 'Saint-Ex', "A Kid in King Arthur's Court", 'Spectre', 'The Girl Who Played with Fire', 'Bond 25', "The Girl Who Kicked the Hornets' Nest"]
['8.0', '7.9', '7.9', '7.8', '7.7', '7.6', '7.5', '7.4', '7.4', '7.2', '7.2', '7.1', '7.1', '7.1', '6.9', '6.8', '6.8', '6.8', '6.7', '6.7', '6.6', '6.5', '6.4', '6.3', '6.1', '6.1', '6.1', '5.9', '5.9', '5.7', '5.5', '5.3', '5.3', '5.1', '4.7']