I'd like to scrape, using Python 3.6, the H3 titles from within the DIVs on this page:
https://player.bfi.org.uk/search/rentals?q=&sort=title&page=1
Note that the page number in the URL increments by 1 per page.
I'm struggling to return, or even identify, the titles.
"""Fetch the first BFI Player rentals search page and count the film cards.

NOTE(review): the film cards appear to be injected client-side via XHR,
so this static HTML likely contains no 'card card--rentals' divs — the
JSON endpoint shown in the answer below is the reliable source.
"""
from requests import get
from bs4 import BeautifulSoup

url = 'https://player.bfi.org.uk/search/rentals?q=&sort=title&page=1'
response = get(url)

html_soup = BeautifulSoup(response.text, 'lxml')

# Each rental film is rendered as a <div class="card card--rentals">.
movie_containers = html_soup.find_all('div', class_='card card--rentals')
print(type(movie_containers))
print(len(movie_containers))
I've tried looping through them also:
# Print each film's title from the card containers parsed above.
# The original snippet iterated as `dd` but referenced `div` inside the
# body, and `page` was never defined — select from the parsed soup
# (`html_soup`) and use one consistent loop variable.
for card in html_soup.select("div.card__content"):
    print(card.select_one("h3.card__title").text.strip())
Any help would be great.
Thanks,
I'm expecting results of Title of each film from each page, including link to the film. Eg. https://player.bfi.org.uk/rentals/film/watch-akenfield-1975-online
The page loads its content via an XHR request to another URL, so your static request misses it. You can mimic that XHR POST request and alter the JSON payload it sends; if you increase `size`, you get more results.
"""Query the BFI Player search endpoint directly and print film titles + links."""
import requests

# This is the same JSON payload the page's own XHR sends; "size" controls
# how many results Elasticsearch returns in a single response, and the
# filter term restricts results to the rentals pillar.
data = {"size":1480,"from":0,"sort":"sort_title","aggregations":{"genre":{"terms":{"field":"genre.raw","size":10}},"captions":{"terms":{"field":"captions"}},"decade":{"terms":{"field":"decade.raw","order":{"_term":"asc"},"size":20}},"bbfc":{"terms":{"field":"bbfc_rating","size":10}},"english":{"terms":{"field":"english"}},"audio_desc":{"terms":{"field":"audio_desc"}},"colour":{"terms":{"field":"colour"}},"mono":{"terms":{"field":"mono"}},"fiction":{"terms":{"field":"fiction"}}},"min_score":0.5,"query":{"bool":{"must":{"match_all":{}},"must_not":[],"should":[],"filter":{"term":{"pillar.raw":"rentals"}}}}}

r = requests.post('https://search-es.player.bfi.org.uk/prod-films/_search', json=data).json()

# Each hit's _source carries the film title and its site-relative URL.
for film in r['hits']['hits']:
    print(film['_source']['title'], 'https://player.bfi.org.uk' + film['_source']['url'])
The actual result count for rentals is in the JSON at `r['hits']['total']`, so you can make an initial request with a `size` much higher than you expect, check whether a second request is needed, and then gather any extras by altering `from` and `size` to mop up anything outstanding.
"""Collect every rental title + link from the BFI search endpoint into a DataFrame.

Makes one oversized request, then — if the reported total exceeds it —
a single follow-up request for the remainder.
"""
import requests
import pandas as pd

initial_count = 10000  # deliberately far higher than the expected total
results = []


def add_results(r):
    """Append [title, absolute link] for every hit in an ES response dict."""
    for film in r['hits']['hits']:
        results.append([film['_source']['title'],
                        'https://player.bfi.org.uk' + film['_source']['url']])


with requests.Session() as s:
    # Same JSON payload the page's XHR sends, with our own "size".
    data = {"size": initial_count,"from":0,"sort":"sort_title","aggregations":{"genre":{"terms":{"field":"genre.raw","size":10}},"captions":{"terms":{"field":"captions"}},"decade":{"terms":{"field":"decade.raw","order":{"_term":"asc"},"size":20}},"bbfc":{"terms":{"field":"bbfc_rating","size":10}},"english":{"terms":{"field":"english"}},"audio_desc":{"terms":{"field":"audio_desc"}},"colour":{"terms":{"field":"colour"}},"mono":{"terms":{"field":"mono"}},"fiction":{"terms":{"field":"fiction"}}},"min_score":0.5,"query":{"bool":{"must":{"match_all":{}},"must_not":[],"should":[],"filter":{"term":{"pillar.raw":"rentals"}}}}}
    r = s.post('https://search-es.player.bfi.org.uk/prod-films/_search', json=data).json()
    total_results = int(r['hits']['total'])
    add_results(r)

    # If more films exist than the first request covered, page once more
    # from where we left off to collect the remainder.
    if total_results > initial_count:
        data['size'] = total_results - initial_count
        data['from'] = initial_count
        r = s.post('https://search-es.player.bfi.org.uk/prod-films/_search', json=data).json()
        add_results(r)

df = pd.DataFrame(results, columns=['Title', 'Link'])
print(df.head())