I'm new to python and have managed to get this far trying the HTML Parser, but I'm stuck on how to get pagination for the reviews at the bottom of the page to work for the site.
The URL is in the PasteBin code, I am leaving out the URL in this thread for privacy reasons.
Any help is much appreciated.
# Reviews Scrape
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'EXAMPLE.COM'
# opening up connection, grabbing, the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# HTML Parsing
page_soup = soup(page_html, "html.parser")
# Grabs each review
reviews = page_soup.findAll("div",{"class":"jdgm-rev jdgm-divider-top"})
filename = "compreviews.csv"
f = open(filename, "w")
headers = "Score, Title, Content\n"
f.write(headers)
# HTML Lookup Location per website and strips spacing
for container in reviews:
# score = container.div.div.span["data-score"]
score = container.findAll("span",{"data-score":True})
user_score = score[0].text.strip()
title_review = container.findAll("b",{"class":"jdgm-rev__title"})
user_title = title_review[0].text.strip()
content_review = container.findAll("div",{"class":"jdgm-rev__body"})
user_content = content_review[0].text.strip()
print("user_score:" + score[0]['data-score'])
print("user_title:" + user_title)
print("user_content:" + user_content)
f.write(score[0]['data-score'] + "," +user_title + "," +user_content + "\n")
f.close()
The page does an xhr GET request using a query string to get results. This query string has parameters for reviews per page and page number. You can make an initial request with what seems like the max reviews per page of 31, extract the html from the json returned then grab the page count; write a loop to run over all pages getting results. Example construct below:
import requests
from bs4 import BeautifulSoup as bs
start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
with requests.Session() as s:
r = s.get(start_url).json()
soup = bs(r['html'], 'lxml')
print([i.text for i in soup.select('.jdgm-rev__author')])
print([i.text for i in soup.select('.jdgm-rev__title')])
total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
for page in range(2, total_pages + 1):
r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
soup = bs(r['html'], 'lxml')
print([i.text for i in soup.select('.jdgm-rev__author')])
print([i.text for i in soup.select('.jdgm-rev__title')]) #etc
Example dataframe to csv
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
authors = []
titles = []
with requests.Session() as s:
r = s.get(start_url).json()
soup = bs(r['html'], 'lxml')
authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
for page in range(2, total_pages + 1):
r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
soup = bs(r['html'], 'lxml')
authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
titles.extend([i.text for i in soup.select('.jdgm-rev__title')]) #etc
headers = ['Author','Title']
df = pd.DataFrame(zip(authors,titles), columns = headers)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8',index = False )