Tags: python, beautifulsoup

How to scrape all customer reviews?


I am trying to scrape all the reviews on this page: https://www.backmarket.com/en-us/r/l/airpods/345c3c05-8a7b-4d4d-ac21-518b12a0ec17. The site says there are 753 reviews, but when I scrape the page I only get 10 of them, so I am not sure how to get all 753. Here is my code:

# importing modules 
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

# Fetch the web page
url = 'https://www.backmarket.com/en-us/r/l/airpods/345c3c05-8a7b-4d4d-ac21-518b12a0ec17'
response = get(url)  # download the page HTML
page = response.text

# Parse the HTML content
soup = BeautifulSoup(page, 'html.parser')

# To see different information
## reviewer's name 

reviewers_name = soup.find_all('p', class_='body-1-bold')
name = [x.text for x in reviewers_name]

## Purchase Data 

purchase_date = soup.find_all('p', class_='text-static-default-low body-2')
date = [x.text for x in purchase_date]


## Country 

country_text = soup.find_all('p', class_='text-static-default-low body-2 mt-32')
country = [x.text for x in country_text]


## Reviewed Products 

products_text = soup.find_all('span', class_='rounded-xs inline-block max-w-full truncate body-2-bold px-4 py-0 bg-static-default-mid text-static-default-hi')
products = [x.text for x in products_text]

## Actual Reviews 

review_text = soup.find_all('p', class_='body-1 block whitespace-pre-line')
review = [x.text for x in review_text]


## Review Ratings 

review_ratings_value = soup.find_all('span', class_='ml-4 mt-1 md:mt-2 body-2-bold')
review_ratings = [x.text for x in review_ratings_value]



# Create the Data Frame 
pd.DataFrame({
    'reviewers_name': name,
    'purchase_date': date,
    'country': country,
    'products': products,
    'review': review,
    'review_ratings': review_ratings
})

My question is how I can scrape all reviews.


Solution

  • Based on what you're after, I think the requests library and a little bit of code can fetch the result you want. Here is my approach:

    The page only renders the first batch of reviews in its initial HTML; the rest are loaded on demand, which is why your scraper only sees 10. Instead, we can call the https://www.backmarket.com/reviews/product-landings/345c3c05-8a7b-4d4d-ac21-518b12a0ec17/products/reviews API endpoint directly to fetch all of the review information. (I'm guessing the UUID in the URL is your product ID; correct me if I'm wrong.)

    Note: the site has rate-limit protection, so I added a time.sleep(3) between requests to stay under it (if you still hit the limit, there is a retry sketch at the end of this answer).

    Here is my code:

    import time
    import urllib3
    import requests

    # verify=False is used below, so silence the resulting InsecureRequestWarning
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def get_data(content):
        """Print the review fields from one page of API results."""
        for i in content['results']:
            name = f"{i['customer']['firstName']} {i['customer']['lastName']}"
            rating = i['averageRate']
            review = i['comment']
            date = i['createdAt']
            prod = i['product']['title']
            prod_img = i['product']['imageUrl']
            country = i['countryCode']

            print(
                f"reviewers_name:  {name}\n"
                f"purchase_date:  {date}\n"
                f"country:  {country}\n"
                "products:-----------\n"
                f"product_name:  {prod}\n"
                f"product_img:  {prod_img}\n"
                "---------------------\n"
                f"review:  {review}\n"
                f"review_ratings:  {rating}\n"
                "============================"
            )

    def gather_cursor(url):
        """Follow the cursor-based pagination until nextCursor comes back empty."""
        cursor_and_url = url
        while True:
            time.sleep(3)  # stay under the rate limit
            response = requests.get(cursor_and_url, verify=False)
            data = response.json()
            get_data(data)
            cursor = data['nextCursor']
            if not cursor:
                break
            cursor_and_url = f"{url}?cursor={cursor}"


    gather_cursor("https://www.backmarket.com/reviews/product-landings/345c3c05-8a7b-4d4d-ac21-518b12a0ec17/products/reviews")
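
    If you would rather end up with the same pandas DataFrame as in your original script instead of printed output, you can collect each review into a list of dicts and build the frame at the end. Here is a minimal sketch of that idea; it reuses the endpoint and JSON field names from the code above, the column names mirror the ones in your question, and collect_reviews and API_URL are just names used for this sketch:

    import time
    import urllib3
    import requests
    import pandas as pd

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    API_URL = "https://www.backmarket.com/reviews/product-landings/345c3c05-8a7b-4d4d-ac21-518b12a0ec17/products/reviews"

    def collect_reviews(url):
        """Walk the paginated review API and return all reviews as a DataFrame."""
        rows, cursor_and_url = [], url
        while True:
            time.sleep(3)  # stay under the rate limit
            data = requests.get(cursor_and_url, verify=False).json()
            for i in data['results']:
                rows.append({
                    'reviewers_name': f"{i['customer']['firstName']} {i['customer']['lastName']}",
                    'purchase_date': i['createdAt'],
                    'country': i['countryCode'],
                    'products': i['product']['title'],
                    'review': i['comment'],
                    'review_ratings': i['averageRate'],
                })
            cursor = data['nextCursor']
            if not cursor:
                break
            cursor_and_url = f"{url}?cursor={cursor}"
        return pd.DataFrame(rows)

    df = collect_reviews(API_URL)
    print(df.shape)  # roughly 753 rows if every review is fetched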
    

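    One more thing about the rate limiting: if you still get throttled despite the time.sleep(3), a simple option is to retry with a progressively longer pause. The sketch below assumes the server signals throttling with an HTTP 429 status code (I have not verified that this endpoint actually does), and get_with_retry is just a name used for the sketch; it could be used as a drop-in replacement for the requests.get call inside gather_cursor:

    import time
    import requests

    def get_with_retry(url, max_retries=5, base_pause=10):
        """GET a URL, backing off and retrying while the server keeps returning 429."""
        for attempt in range(max_retries):
            response = requests.get(url, verify=False)
            # 429 "Too Many Requests" is assumed to be the throttling signal here
            if response.status_code != 429:
                return response
            time.sleep(base_pause * (attempt + 1))  # wait a little longer on each retry
        # still throttled after every retry: raise so the caller notices
        response.raise_for_status()
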
    Hope this will help.