Search code examples
python beautifulsoup css-selectors urllib

Why does the select_one function in BeautifulSoup return None?


I'm using Python 3 and BeautifulSoup.

The following code returns a list of None values:

import bs4 as bs
import urllib.request
import pandas as pd
from requests_html import HTMLSession


# Column-oriented accumulator: one list per DataFrame column.
review_dict = {'review':[], 'author':[]}

page = 1

# Runs for page = 1 .. 9; `page` only changes at the bottom of the loop body.
while page != 10:
    session = HTMLSession()

    # `++page` is two unary plus operators, not an increment, so this is just
    # str(page) appended directly after `rank=1`.
    url = 'https://www.goodreads.com/book/show/2932708?from_search=true&from_srp=true&qid=OOQwYQkG9A&rank=1' + str(++page)

    grURL = session.get(url)

    soup = bs.BeautifulSoup(grURL.content, 'html.parser')
    prod_containers = soup.find('div', id = 'lazy_loadable_view')
    firstelement = prod_containers.find_all('div', attrs={'class': 'left bodycol'})
    
    for rows in firstelement:
        # In a CSS selector a space is a descendant combinator, so
        # `div.reviewText stacked` asks for a <stacked> element inside
        # div.reviewText (and `uitext stacked` likewise). Nothing matches, and
        # select_one() returns None in that case.
        review = rows.select_one('p > div.reviewText stacked > span.readable > span')
        author = rows.select_one('p > div.reviewHeader uitext stacked > span > a[title]')
        review_dict['review'].append(review)
        review_dict['author'].append(author)

    # Never true here: the `while` condition already ends the loop once page is 10.
    if page == 10:
        break
    page += 1
sword_reviews = pd.DataFrame(review_dict)
# Bare expression: shown as cell output in a notebook, no effect in a plain script.
sword_reviews

And when I use the .text attribute, Jupyter Notebook gives me this error:

AttributeError: 'NoneType' object has no attribute 'text'

How can I adjust my code to scrape the reviews and reviewer names correctly?


Solution

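  • To get the data you want, you need to change the selector string passed to select_one. When an element carries several classes, join them with '.' (for example, div.reviewText.stacked). A space between names is a descendant combinator, so 'div.reviewText stacked' looks for a <stacked> element inside the div and matches nothing, which is why select_one returns None. The leading 'p >' is dropped from both selectors as well.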

    Try this code:

    import bs4 as bs
    import urllib.request
    import pandas as pd
    from requests_html import HTMLSession
    
    
    # Column-oriented accumulator: one list per DataFrame column.
    review_dict = {'review': [], 'author': []}

    # NOTE(review): the page number is appended directly after `rank=1`, so the
    # requested URLs end in rank=11, rank=12, ... This is unchanged from the
    # original code; confirm it is how the site actually paginates.
    base_url = 'https://www.goodreads.com/book/show/2932708?from_search=true&from_srp=true&qid=OOQwYQkG9A&rank=1'

    # One session serves every request; there is no need to rebuild it per page.
    session = HTMLSession()

    # Pages 1 through 9, the same range the `while page != 10` loop visited.
    for page in range(1, 10):
        # Python has no `++` operator: `++page` is `+(+page)`, i.e. plain `page`.
        url = base_url + str(page)

        grURL = session.get(url)

        soup = bs.BeautifulSoup(grURL.content, 'html.parser')
        prod_containers = soup.find('div', id='lazy_loadable_view')
        if prod_containers is None:
            # find() returns None when the container is missing; skip this page
            # rather than fail with AttributeError on .find_all().
            continue

        for rows in prod_containers.find_all('div', attrs={'class': 'left bodycol'}):
            # Several classes on one element are chained with '.'; a space would
            # be a descendant combinator and match nothing.
            review = rows.select_one('div.reviewText.stacked > span.readable > span')
            author = rows.select_one('div.reviewHeader.uitext.stacked > span > a[title]')

            # select_one() returns None when nothing matches, so guard before
            # reading the text; store plain strings rather than Tag objects.
            review_dict['review'].append(
                review.get_text(' ', strip=True) if review is not None else None
            )
            review_dict['author'].append(
                author.get_text(' ', strip=True) if author is not None else None
            )

    sword_reviews = pd.DataFrame(review_dict)

    print(sword_reviews)