I'm using Python 3 and BeautifulSoup.
The following code returns a list of `None` values:
import bs4 as bs
import urllib.request
import pandas as pd
from requests_html import HTMLSession
# Walk the Goodreads book page nine times and collect, for every review block,
# whatever the two CSS selectors match (a Tag, or None when nothing matches).
review_dict = {'review': [], 'author': []}
page = 1
while page != 10:
    session = HTMLSession()
    # `++page` is unary plus applied twice (Python has no increment operator),
    # so the current value of `page` is appended unchanged.
    url = 'https://www.goodreads.com/book/show/2932708?from_search=true&from_srp=true&qid=OOQwYQkG9A&rank=1' + str(++page)
    response = session.get(url)
    soup = bs.BeautifulSoup(response.content, 'html.parser')
    container = soup.find('div', id='lazy_loadable_view')
    for block in container.find_all('div', attrs={'class': 'left bodycol'}):
        # Both lookups run before either result is stored.
        review, author = (
            block.select_one('p > div.reviewText stacked > span.readable > span'),
            block.select_one('p > div.reviewHeader uitext stacked > span > a[title]'),
        )
        review_dict['review'].append(review)
        review_dict['author'].append(author)
    page += 1
sword_reviews = pd.DataFrame(review_dict)
sword_reviews
And when I use `.text`, Jupyter Notebook gives me this error:
AttributeError: 'NoneType' object has no attribute 'text'
How can I adjust my code to scrape the reviews and reviewer names correctly?
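To get the data you want, you need to change the search string passed to `select_one`.
Join multiple classes on the same element together with `.` — for example `div.reviewText.stacked` instead of `div.reviewText stacked`, because a space in a CSS selector means "descendant element", not "another class".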
Try this code:
import bs4 as bs
import urllib.request
import pandas as pd
from requests_html import HTMLSession
# Request the Goodreads book page nine times and collect the review text and
# reviewer name from every review block into a DataFrame.
review_dict = {'review': [], 'author': []}
base_url = 'https://www.goodreads.com/book/show/2932708?from_search=true&from_srp=true&qid=OOQwYQkG9A&rank=1'


def _text_or_none(tag):
    """Return the stripped text of a matched tag, or None when nothing matched."""
    return tag.get_text().strip() if tag is not None else None


# One session is shared by all requests and closed when the block exits.
with HTMLSession() as session:
    for page in range(1, 10):
        # Python has no `++` operator: the original `++page` was just `page`,
        # so the plain value is appended here.
        # NOTE(review): the number lands directly after "rank=1", so the
        # requests carry rank=11, rank=12, ... -- confirm which query
        # parameter Goodreads actually uses for review pagination.
        url = base_url + str(page)
        response = session.get(url)
        soup = bs.BeautifulSoup(response.content, 'html.parser')
        container = soup.find('div', id='lazy_loadable_view')
        if container is None:
            # find() returns None when the div is absent; skip this response
            # instead of failing with AttributeError on .find_all.
            continue
        for row in container.find_all('div', attrs={'class': 'left bodycol'}):
            # Classes on the same element are chained with '.', not spaces.
            review = row.select_one('div.reviewText.stacked > span.readable > span')
            author = row.select_one('div.reviewHeader.uitext.stacked > span > a[title]')
            # Store text rather than Tag objects; a missing match becomes None
            # so both columns stay the same length.
            review_dict['review'].append(_text_or_none(review))
            review_dict['author'].append(_text_or_none(author))

sword_reviews = pd.DataFrame(review_dict)
print(sword_reviews)