Tags: python, python-3.x, web-scraping, beautifulsoup, html-parsing

Python HTML Parser Pagination


I'm new to Python and have managed to get this far with the HTML parser, but I'm stuck on how to get pagination working for the reviews at the bottom of the page on this site.

The URL is in the PasteBin code; I'm leaving it out of this thread for privacy reasons.

Any help is much appreciated.

# Reviews Scrape

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'EXAMPLE.COM'

# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML Parsing
page_soup = soup(page_html, "html.parser")

# Grabs each review
reviews = page_soup.findAll("div",{"class":"jdgm-rev jdgm-divider-top"})

filename = "compreviews.csv"
f = open(filename, "w")

headers = "Score, Title, Content\n"

f.write(headers)
# look up each field within the review container and strip surrounding whitespace
for container in reviews:
    # score = container.div.div.span["data-score"]
    score = container.findAll("span",{"data-score":True})
    user_score = score[0].text.strip()

    title_review = container.findAll("b",{"class":"jdgm-rev__title"})
    user_title = title_review[0].text.strip()

    content_review = container.findAll("div",{"class":"jdgm-rev__body"})
    user_content = content_review[0].text.strip()

    print("user_score:" + score[0]['data-score'])
    print("user_title:" + user_title)
    print("user_content:" + user_content)

    f.write(score[0]['data-score'] + "," +user_title + "," +user_content + "\n")

f.close()

Solution

  • The page does an XHR GET request with a query string to fetch results; you can watch this request in your browser's dev tools (Network tab) while paging through the reviews. The query string has parameters for the number of reviews per page and the page number. You can make an initial request with what seems to be the maximum of 31 reviews per page, extract the HTML from the returned JSON, and grab the total page count, then loop over the remaining pages collecting results. Example construct below:

    import requests
    from bs4 import BeautifulSoup as bs
    
    start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
    
    with requests.Session() as s:
        # the endpoint returns JSON; the rendered reviews markup sits in the 'html' key
        r = s.get(start_url).json()
        soup = bs(r['html'], 'lxml')
        print([i.text for i in soup.select('.jdgm-rev__author')])
        print([i.text for i in soup.select('.jdgm-rev__title')])
        # the last pagination button carries the total page count
        total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    
        for page in range(2, total_pages + 1):
            r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
            soup = bs(r['html'], 'lxml')
            print([i.text for i in soup.select('.jdgm-rev__author')])
            print([i.text for i in soup.select('.jdgm-rev__title')])  # etc.
    
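    As an aside, requests can build the query string for you from a params dict, which keeps the loop a little cleaner. This is a minimal sketch assuming the same page, per_page, and product_id parameters seen in the URL above; the base URL is still a placeholder you would swap for the real endpoint:

    import requests
    from bs4 import BeautifulSoup as bs
    
    base_url = 'https://urlpart'  # placeholder endpoint, as above
    
    with requests.Session() as s:
        # requests url-encodes this dict into ?page=1&per_page=31&product_id=someid
        params = {'page': 1, 'per_page': 31, 'product_id': 'someid'}
        r = s.get(base_url, params=params).json()
        soup = bs(r['html'], 'lxml')
        total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    
        for page in range(2, total_pages + 1):
            params['page'] = page  # only the page number changes between requests
            r = s.get(base_url, params=params).json()
            soup = bs(r['html'], 'lxml')
            print([i.text for i in soup.select('.jdgm-rev__title')])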

    Example of collecting the results into a dataframe and writing to CSV

    import requests
    from bs4 import BeautifulSoup as bs
    import pandas as pd
    
    start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
    
    authors = []
    titles = []
    
    with requests.Session() as s:
        r = s.get(start_url).json()
        soup = bs(r['html'], 'lxml')
        authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
        titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
        total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    
        for page in range(2, total_pages + 1):
            r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
            soup = bs(r['html'], 'lxml')
            authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
            titles.extend([i.text for i in soup.select('.jdgm-rev__title')])  # etc.
    
    headers = ['Author', 'Title']
    df = pd.DataFrame(zip(authors, titles), columns=headers)
    df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8', index=False)
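
    One caveat on the question's original CSV writing: concatenating fields with raw commas (f.write(... + "," + ...)) produces a malformed file whenever a review title or body itself contains a comma or newline. If you'd rather not pull in pandas, the standard-library csv module handles the quoting for you; a minimal sketch reusing the authors and titles lists built above:

    import csv
    
    with open('compreviews.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)  # automatically quotes fields containing commas or newlines
        writer.writerow(['Author', 'Title'])
        writer.writerows(zip(authors, titles))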