Tags: python, selenium-webdriver, web-scraping

Selenium - Finding and Clicking a Show More button continually in a :before section


See latest update at bottom

Using Python 3.12.3 and Selenium, I'm trying to load more review rows before scraping; I'm very new to the process. Ideally I'd load all of them, or at least as many as possible, though the website may cap the total reviews on the page at some point. Clicking manually, I've been able to press the button at least 10 more times without it failing. Any help would be appreciated, and please let me know if I can provide any other context.

Here is a screenshot of the section I'd like to click: Show More Button. I'm consistently unable to click this section, and I believe it's due to the :before pseudo-element; in addition, there are a number of div.Button__container elements on the page, so my click attempts usually end up pressing buttons I don't mean to.
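
Since a ::before pseudo-element isn't part of the DOM, a click can never land on it directly; it has to hit the owning button element. Below is a minimal sketch of targeting that specific button by its visible label instead of the ambiguous div.Button__container (the label text "Show more reviews" is an assumption about the page, not something confirmed here):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.goodreads.com/book/show/59364173/reviews")

# ::before content never receives events, so wait for the real <button>.
# The visible label "Show more reviews" is assumed and may need adjusting.
show_more = WebDriverWait(driver, 15).until(
    EC.element_to_be_clickable(
        (By.XPATH, "//button[contains(., 'Show more reviews')]")
    )
)
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", show_more)
show_more.click()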

Here is the script I'm using to pull the reviews. It can't actually click to load more rows, which is the gap I'm hoping to bridge with Selenium:

import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from IPython.display import display, Image

driver = webdriver.Chrome()
title = driver.title  # grabs the (blank) page title; the driver isn't used again in this snippet

# McCurdy - I'm Glad my Mom Died
# A single GET is enough here: the URL (including its "after" cursor) is
# fixed, so looping over offsets would just refetch the same page.
response = requests.get('https://www.goodreads.com/book/show/26074156/reviews?reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v1.FSsY8ohzUZCeEXoBsiEYqw%22,%22after%22:%22NjgxNSwxNTAwNjU3MjE4NDI1%22}')
print(response.status_code)

time.sleep(6)

doc = BeautifulSoup(response.text, 'html.parser')

df = pd.DataFrame(columns=['ReviewDate','ReviewerName','Rating','ReviewText','ReviewerMeta'])

ratings = []

# Collect the rating elements; this selector is a guess based on the
# aria-label format parsed below ("Rating N out of 5") and may need adjusting
book_tags_ = doc.find_all('span', attrs={'aria-label': True})

# Loop through all elements found
for tag in book_tags_:
    # Get the aria-label attribute from the current element
    aria_label = tag.get('aria-label')
    
    # Check if aria-label is not None and contains the expected format
    if aria_label and 'Rating ' in aria_label and ' out of 5' in aria_label:
        # Split the aria-label to extract the desired text
        rating_text = aria_label.split('Rating ')[1].split(' out of 5')[0]
        
        # Append the rating_text to the list of ratings
        ratings.append(rating_text)
    else:
        print(f"Skipping element with aria-label: {aria_label}")

# Create a dataframe from the list of ratings
df = pd.DataFrame({'Rating': ratings})

#This is repeated for additional fields

Here is my code that I'm using to attempt a button press:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, ElementClickInterceptedException

# Initialize WebDriver
driver = webdriver.Chrome()  # or another driver you're using

url = "https://www.goodreads.com/book/show/59364173/reviews?reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k%22,%22after%22:%22MjYwMTYsMTY2MDc1MjY5MjY2Mw%22}"

# Open the page
driver.get(url)
time.sleep(20)

def click_show_more():
    while True:
        try:
            # Scroll to the bottom of the page to ensure button is in view
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            
            # Find the 'Show More' button
            show_more_button = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.Divider.Divider--contents.Divider--largeMargin > div.Button__container'))
            )
            
            # Scroll the element into view
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            
            # Ensure the element is clickable
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.Divider.Divider--contents.Divider--largeMargin > div.Button__container')))
            
            # Try to click the button using JavaScript if needed
            driver.execute_script("arguments[0].click();", show_more_button)
            
            # Optionally, wait for some condition after clicking, e.g., new content to load
            WebDriverWait(driver, 10).until(EC.staleness_of(show_more_button))
        
        except TimeoutException:
            print("No more 'Show More' button found or timed out.")
            break
        except StaleElementReferenceException:
            print("StaleElementReferenceException: Trying to find the button again.")
            continue
        except ElementClickInterceptedException:
            print("ElementClickInterceptedException: Element is being obstructed.")
            continue
        except Exception as e:
            print(f"Error clicking 'Show More' button: {e}")
            break

# Remember to call your function
click_show_more()

After @x1337loser's comment, I added an append-to-list section to the get_data() function, after which I started getting a TypeError: 'NoneType' object is not subscriptable (screenshot: Link to new code error).

I also removed the user image from the printout and added a data dictionary and a write-to-CSV step at the end of the script, but these steps do not appear to be causing any issues.

See new code below:

import time
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
import pandas as pd

ReviewText = []
ratings = []
Reviewer = []
reviewdt = []
ReviewerId = []
Author_ID = []

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def get_data(content):
    data = content['data']['getReviews']['edges']
    for i in data:
        id_user = i['node']['creator']['id']
        rev_img = i['node']['creator']['imageUrlSquare']
        is_author = i['node']['creator']['isAuthor']
        follower_count = i['node']['creator']['followersCount']
        name = i['node']['creator']['name']
        review = BeautifulSoup(i['node']['text'], "lxml").text #removed HTML tag from text
        review_create_data = i['node']['createdAt']
        result_ms = pd.to_datetime(review_create_data, unit='ms') # decode timestamp (pandas is imported as pd here)
        review_liked = i['node']['likeCount']
        rating = i['node']['rating']
        # reviewer_image ({rev_img}) removed from the printout per the note above
        print(f"reviewers_name:  {name}\nreviewer_user_id:  {id_user}\nIs reviewer an author:  {is_author}\nreview_date:  {result_ms}\nreviewer_follower:  {follower_count}\nreview:  {review}\nreview_ratings:  {rating}\nreview_liked:  {review_liked}\n========================================")

        #print(f"Is reviewr is author:  {is_author}\nreviewer_follower:  {follower_count}\nreview_liked:  {review_liked}\n========================================")


        # Plain assignments (wrapping a value in curly braces would create a
        # one-element set and pollute the CSV output)
        ReviewText_content = review
        rating_text = rating
        Reviewer_name = name
        reviewdt_content = result_ms
        reviewer_id = id_user
        author = is_author

        # Append each field to its list
        Reviewer.append(Reviewer_name)
        reviewdt.append(reviewdt_content)
        ratings.append(rating_text)
        ReviewText.append(ReviewText_content)
        ReviewerId.append(reviewer_id)
        Author_ID.append(author)


def gatherNextPage(resourceId):
    url = "https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql"
    headers = {
        "User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0",
        "X-Api-Key": "da2-xpgsdydkbregjhpr6ejzqdhuwy" # The Server added this API KEY automatically and strict checking happened from the client-server request
    }
    data = {
    "operationName": "getReviews",
    "variables": {
        "filters": {
        "resourceType": "WORK",
        "resourceId": f"{resourceId}"
        },
        "pagination": {
        "limit": 100
        }
    },
    "query": "query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {\n  getReviews(filters: $filters, pagination: $pagination) {\n    ...BookReviewsFragment\n    __typename\n  }\n}\n\nfragment BookReviewsFragment on BookReviewsConnection {\n  totalCount\n  edges {\n    node {\n      ...ReviewCardFragment\n      __typename\n    }\n    __typename\n  }\n  pageInfo {\n    prevPageToken\n    nextPageToken\n    __typename\n  }\n  __typename\n}\n\nfragment ReviewCardFragment on Review {\n  __typename\n  id\n  creator {\n    ...ReviewerProfileFragment\n    __typename\n  }\n  recommendFor\n  updatedAt\n  createdAt\n  spoilerStatus\n  lastRevisionAt\n  text\n  rating\n  shelving {\n    shelf {\n      name\n      webUrl\n      __typename\n    }\n    taggings {\n      tag {\n        name\n        webUrl\n        __typename\n      }\n      __typename\n    }\n    webUrl\n    __typename\n  }\n  likeCount\n  viewerHasLiked\n  commentCount\n}\n\nfragment ReviewerProfileFragment on User {\n  id: legacyId\n  imageUrlSquare\n  isAuthor\n  ...SocialUserFragment\n  textReviewsCount\n  viewerRelationshipStatus {\n    isBlockedByViewer\n    __typename\n  }\n  name\n  webUrl\n  contributor {\n    id\n    works {\n      totalCount\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment SocialUserFragment on User {\n  viewerRelationshipStatus {\n    isFollowing\n    isFriend\n    __typename\n  }\n  followersCount\n  __typename\n}\n"
    }
    data_next = data  # first request uses the base body without an "after" token
    while True:
        time.sleep(3)
        resp = requests.post(url, headers=headers, json=data_next, verify=False)
        data = resp.json()
        get_data(data)
        nextPageToken = data['data']['getReviews']['pageInfo']['nextPageToken']
        if not nextPageToken:
            break

        data_next = {
        "operationName": "getReviews",
        "variables": {
            "filters": {
            "resourceType": "WORK",
            "resourceId": f"{resourceId}"
            },
            "pagination": {
            "after": f"{nextPageToken}", #nextPageToken from response data, and the total limit is 100 results per request.
            "limit": 100
            }
        },
        "query": "query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {\n  getReviews(filters: $filters, pagination: $pagination) {\n    ...BookReviewsFragment\n    __typename\n  }\n}\n\nfragment BookReviewsFragment on BookReviewsConnection {\n  totalCount\n  edges {\n    node {\n      ...ReviewCardFragment\n      __typename\n    }\n    __typename\n  }\n  pageInfo {\n    prevPageToken\n    nextPageToken\n    __typename\n  }\n  __typename\n}\n\nfragment ReviewCardFragment on Review {\n  __typename\n  id\n  creator {\n    ...ReviewerProfileFragment\n    __typename\n  }\n  recommendFor\n  updatedAt\n  createdAt\n  spoilerStatus\n  lastRevisionAt\n  text\n  rating\n  shelving {\n    shelf {\n      name\n      webUrl\n      __typename\n    }\n    taggings {\n      tag {\n        name\n        webUrl\n        __typename\n      }\n      __typename\n    }\n    webUrl\n    __typename\n  }\n  likeCount\n  viewerHasLiked\n  commentCount\n}\n\nfragment ReviewerProfileFragment on User {\n  id: legacyId\n  imageUrlSquare\n  isAuthor\n  ...SocialUserFragment\n  textReviewsCount\n  viewerRelationshipStatus {\n    isBlockedByViewer\n    __typename\n  }\n  name\n  webUrl\n  contributor {\n    id\n    works {\n      totalCount\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment SocialUserFragment on User {\n  viewerRelationshipStatus {\n    isFollowing\n    isFriend\n    __typename\n  }\n  followersCount\n  __typename\n}\n"
        }

gatherNextPage("kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k")

data_dict = {
    'ReviewDate': reviewdt,
    'ReviewerName': Reviewer,
    'Rating': ratings,
    'ReviewText': ReviewText,
    'ReviewerID': ReviewerId,
    'IsAuthor': Author_ID

    #'ReviewerMeta': reviewer_meta
}



df = pd.DataFrame(data_dict)
df.to_csv('McCurdy_Sample_Sept30.csv', index=False)

Solution

  • Note: This answer uses a different method to reach your goal. (Modules used: requests, bs4, pandas, time)

    Hi there,

    Based on your question, I think you're trying to fetch all of the review data. I found a better approach than using Selenium; here is my thinking:

    Your target app has a GraphQL API endpoint, https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql, which fetches all of the review details from the server. If we send a POST request to this endpoint specifying a resourceId in the body, we can easily get that data with the Python requests library and a little bit of coding. Here is my code:

    Note: to avoid rate limiting, I used time.sleep(3) to throttle the requests.

    import time
    import requests
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    from bs4 import BeautifulSoup
    import pandas
    
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    
    def get_data(content):
        data = content['data']['getReviews']['edges']
        for i in data:
            id_user = i['node']['creator']['id']
            rev_img = i['node']['creator']['imageUrlSquare']
            is_author = i['node']['creator']['isAuthor']
            follower_count = i['node']['creator']['followersCount']
            name = i['node']['creator']['name']
            review = BeautifulSoup(i['node']['text'], "lxml").text #removed HTML tag from text
            review_create_data = i['node']['createdAt']
            result_ms = pandas.to_datetime(review_create_data,unit='ms') #decode timestamp 
            review_liked = i['node']['likeCount']
            rating = i['node']['rating']
            
            print(f"reviewers_name:  {name}\nreviewer_user_id:  {id_user}\nIs reviewr is author:  {is_author}\nreview_date:  {result_ms}\nreviewer_image:  {rev_img}\nreviewer_follower:  {follower_count}\nreview:  {review}\nreview_ratings:  {rating}\nreview_liked:  {review_liked}\n========================================")
    
    def gatherNextPage(resourceId):
        url = "https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql"
        headers = {
            "User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0",
            "X-Api-Key": "da2-xpgsdydkbregjhpr6ejzqdhuwy" # The Server added this API KEY automatically and strict checking happened from the client-server request
        }
        data = {
        "operationName": "getReviews",
        "variables": {
            "filters": {
            "resourceType": "WORK",
            "resourceId": f"{resourceId}"
            },
            "pagination": {
            "limit": 100
            }
        },
        "query": "query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {\n  getReviews(filters: $filters, pagination: $pagination) {\n    ...BookReviewsFragment\n    __typename\n  }\n}\n\nfragment BookReviewsFragment on BookReviewsConnection {\n  totalCount\n  edges {\n    node {\n      ...ReviewCardFragment\n      __typename\n    }\n    __typename\n  }\n  pageInfo {\n    prevPageToken\n    nextPageToken\n    __typename\n  }\n  __typename\n}\n\nfragment ReviewCardFragment on Review {\n  __typename\n  id\n  creator {\n    ...ReviewerProfileFragment\n    __typename\n  }\n  recommendFor\n  updatedAt\n  createdAt\n  spoilerStatus\n  lastRevisionAt\n  text\n  rating\n  shelving {\n    shelf {\n      name\n      webUrl\n      __typename\n    }\n    taggings {\n      tag {\n        name\n        webUrl\n        __typename\n      }\n      __typename\n    }\n    webUrl\n    __typename\n  }\n  likeCount\n  viewerHasLiked\n  commentCount\n}\n\nfragment ReviewerProfileFragment on User {\n  id: legacyId\n  imageUrlSquare\n  isAuthor\n  ...SocialUserFragment\n  textReviewsCount\n  viewerRelationshipStatus {\n    isBlockedByViewer\n    __typename\n  }\n  name\n  webUrl\n  contributor {\n    id\n    works {\n      totalCount\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment SocialUserFragment on User {\n  viewerRelationshipStatus {\n    isFollowing\n    isFriend\n    __typename\n  }\n  followersCount\n  __typename\n}\n"
        }
        data_next = data  # first request uses the base body without an "after" token
        while True:
            time.sleep(3)
            resp = requests.post(url, headers=headers, json=data_next, verify=False)
            data = resp.json()
            get_data(data)
            nextPageToken = data['data']['getReviews']['pageInfo']['nextPageToken']
            if not nextPageToken:
                break
    
            data_next = {
            "operationName": "getReviews",
            "variables": {
                "filters": {
                "resourceType": "WORK",
                "resourceId": f"{resourceId}"
                },
                "pagination": {
                "after": f"{nextPageToken}", #nextPageToken from response data, and the total limit is 100 results per request.
                "limit": 100
                }
            },
            "query": "query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {\n  getReviews(filters: $filters, pagination: $pagination) {\n    ...BookReviewsFragment\n    __typename\n  }\n}\n\nfragment BookReviewsFragment on BookReviewsConnection {\n  totalCount\n  edges {\n    node {\n      ...ReviewCardFragment\n      __typename\n    }\n    __typename\n  }\n  pageInfo {\n    prevPageToken\n    nextPageToken\n    __typename\n  }\n  __typename\n}\n\nfragment ReviewCardFragment on Review {\n  __typename\n  id\n  creator {\n    ...ReviewerProfileFragment\n    __typename\n  }\n  recommendFor\n  updatedAt\n  createdAt\n  spoilerStatus\n  lastRevisionAt\n  text\n  rating\n  shelving {\n    shelf {\n      name\n      webUrl\n      __typename\n    }\n    taggings {\n      tag {\n        name\n        webUrl\n        __typename\n      }\n      __typename\n    }\n    webUrl\n    __typename\n  }\n  likeCount\n  viewerHasLiked\n  commentCount\n}\n\nfragment ReviewerProfileFragment on User {\n  id: legacyId\n  imageUrlSquare\n  isAuthor\n  ...SocialUserFragment\n  textReviewsCount\n  viewerRelationshipStatus {\n    isBlockedByViewer\n    __typename\n  }\n  name\n  webUrl\n  contributor {\n    id\n    works {\n      totalCount\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment SocialUserFragment on User {\n  viewerRelationshipStatus {\n    isFollowing\n    isFriend\n    __typename\n  }\n  followersCount\n  __typename\n}\n"
            }
    
    gatherNextPage("kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k")
    

    What is resourceId:

    The resourceId is the workId from your URL: reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k%22,%22after%22:%22MjYwMTYsMTY2MDc1MjY5MjY2Mw%22}
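
    For completeness, here is a small sketch of pulling the workId out of that URL programmatically: urllib.parse decodes the query string and json parses the reviewFilters value (the helper name extract_work_id is just illustrative):

    import json
    from urllib.parse import urlparse, parse_qs

    def extract_work_id(url):
        # reviewFilters carries percent-encoded JSON; parse_qs decodes the
        # percent escapes, then json.loads exposes the workId key
        qs = parse_qs(urlparse(url).query)
        filters = json.loads(qs["reviewFilters"][0])
        return filters["workId"]

    url = ("https://www.goodreads.com/book/show/59364173/reviews?reviewFilters="
           "{%22workId%22:%22kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k%22,"
           "%22after%22:%22MjYwMTYsMTY2MDc1MjY5MjY2Mw%22}")
    print(extract_work_id(url))  # kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k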

    I hope this will help.

    Thanks

    2nd Edit:

    Hi, after analyzing the requests with a proxy tool, I found that some creator objects don't have a user id on them (I don't know why); that's why line 19 raises the TypeError: 'NoneType' object is not subscriptable exception. This can be fixed by adding an exception handler to the get_data() function, like this:

    def get_data(content):
        data = content['data']['getReviews']['edges']
        for i in data:
            try: # some reviews come back with a blank/missing creator, which raises TypeError: 'NoneType' object is not subscriptable
                id_user = i['node']['creator']['id']
                rev_img = i['node']['creator']['imageUrlSquare']
                is_author = i['node']['creator']['isAuthor']
                follower_count = i['node']['creator']['followersCount']
                name = i['node']['creator']['name']
                review = BeautifulSoup(i['node']['text'], "lxml").text #removed HTML tag from text
                review_create_data = i['node']['createdAt']
                result_ms = pd.to_datetime(review_create_data,unit='ms') #decode timestamp 
                review_liked = i['node']['likeCount']
                rating = i['node']['rating']
                # reviewer_image ({rev_img}) removed from the printout
                print(f"reviewers_name:  {name}\nreviewer_user_id:  {id_user}\nIs reviewer an author:  {is_author}\nreview_date:  {result_ms}\nreviewer_follower:  {follower_count}\nreview:  {review}\nreview_ratings:  {rating}\nreview_liked:  {review_liked}\n========================================")
    
                #print(f"Is reviewr is author:  {is_author}\nreviewer_follower:  {follower_count}\nreview_liked:  {review_liked}\n========================================")
    
    
                # Plain assignments (wrapping a value in curly braces would
                # create a one-element set and pollute the CSV output)
                ReviewText_content = review
                rating_text = rating
                Reviewer_name = name
                reviewdt_content = result_ms
                reviewer_id = id_user
                author = is_author

                # Append each field to its list
                Reviewer.append(Reviewer_name)
                reviewdt.append(reviewdt_content)
                ratings.append(rating_text)
                ReviewText.append(ReviewText_content)
                ReviewerId.append(reviewer_id)
                Author_ID.append(author)
            except Exception:
                # skip reviews whose creator data is missing
                pass
    

    Let me know if I missed something regarding the error you've faced!

    3rd Edit:

    Language detection using the langdetect module (pip3 install langdetect):

    import time
    import requests
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    from bs4 import BeautifulSoup
    import pandas as pd
    from langdetect import detect
    
    ReviewText = []
    ratings = []
    Reviewer = []
    reviewdt = []
    ReviewerId = []
    Author_ID = []
    Review_language = []
    
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    
    def get_data(content):
        data = content['data']['getReviews']['edges']
        for i in data:
            try: # some reviews come back with a blank/missing creator, which raises TypeError: 'NoneType' object is not subscriptable
                id_user = i['node']['creator']['id']
                rev_img = i['node']['creator']['imageUrlSquare']
                is_author = i['node']['creator']['isAuthor']
                follower_count = i['node']['creator']['followersCount']
                name = i['node']['creator']['name']
                review = BeautifulSoup(i['node']['text'], "lxml").text #removed HTML tag from text
                review_create_data = i['node']['createdAt']
                result_ms = pd.to_datetime(review_create_data,unit='ms') #decode timestamp 
                review_liked = i['node']['likeCount']
                rating = i['node']['rating']
                review_lang = detect(review) # detect the review's language
                print(f"reviewers_name:  {name}\nreviewer_user_id:  {id_user}\nIs reviewer an author:  {is_author}\nreview_date:  {result_ms}\nreviewer_follower:  {follower_count}\nreview:  {review}\nreview_language:  {review_lang}\nreview_ratings:  {rating}\nreview_liked:  {review_liked}\n========================================")
    
                #print(f"Is reviewr is author:  {is_author}\nreviewer_follower:  {follower_count}\nreview_liked:  {review_liked}\n========================================")
    
    
                # Plain assignments (wrapping a value in curly braces would
                # create a one-element set and pollute the CSV output)
                ReviewText_content = review
                rating_text = rating
                Reviewer_name = name
                reviewdt_content = result_ms
                reviewer_id = id_user
                author = is_author
                lang_detect = review_lang

                # Append each field to its list
                Reviewer.append(Reviewer_name)
                reviewdt.append(reviewdt_content)
                ratings.append(rating_text)
                ReviewText.append(ReviewText_content)
                ReviewerId.append(reviewer_id)
                Author_ID.append(author)
                Review_language.append(lang_detect)
            except Exception:
                # skip reviews whose creator data is missing
                pass
    

    Thanks