Tags: python, python-3.x, web-scraping, beautifulsoup, html-parsing

Python HTML Parser Pagination


I'm new to Python and have managed to get this far with the HTML parser, but I'm stuck on how to get pagination working for the reviews at the bottom of the page on this site.

The URL is in the PasteBin code; I'm leaving it out of this thread for privacy reasons.

Any help is much appreciated.

# Reviews Scrape

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'EXAMPLE.COM'

# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML Parsing
page_soup = soup(page_html, "html.parser")

# Grabs each review
reviews = page_soup.findAll("div",{"class":"jdgm-rev jdgm-divider-top"})

filename = "compreviews.csv"
f = open(filename, "w")

headers = "Score, Title, Content\n"

f.write(headers)
# look up each field within the review container and strip surrounding whitespace
for container in reviews:
    # score = container.div.div.span["data-score"]
    score = container.findAll("span",{"data-score":True})
    user_score = score[0].text.strip()

    title_review = container.findAll("b",{"class":"jdgm-rev__title"})
    user_title = title_review[0].text.strip()

    content_review = container.findAll("div",{"class":"jdgm-rev__body"})
    user_content = content_review[0].text.strip()

    print("user_score:" + score[0]['data-score'])
    print("user_title:" + user_title)
    print("user_content:" + user_content)

    f.write(score[0]['data-score'] + "," +user_title + "," +user_content + "\n")

f.close()

Solution

  • The page does an XHR GET request with a query string to fetch results; you can watch this request in your browser's dev tools (Network tab) while paging through the reviews. The query string has parameters for the number of reviews per page and the page number. You can make an initial request with what seems to be the maximum of 31 reviews per page, extract the HTML from the returned JSON, and grab the total page count, then loop over the remaining pages collecting results. Example construct below:

    import requests
    from bs4 import BeautifulSoup as bs
    
    start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
    
    with requests.Session() as s:
        # the endpoint returns JSON; the rendered reviews markup sits in the 'html' key
        r = s.get(start_url).json()
        soup = bs(r['html'], 'lxml')
        print([i.text for i in soup.select('.jdgm-rev__author')])
        print([i.text for i in soup.select('.jdgm-rev__title')])
        # the last pagination button carries the total page count
        total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    
        for page in range(2, total_pages + 1):
            r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
            soup = bs(r['html'], 'lxml')
            print([i.text for i in soup.select('.jdgm-rev__author')])
            print([i.text for i in soup.select('.jdgm-rev__title')])  # etc.
    
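    As an aside, requests can build the query string for you from a params dict, which keeps the loop a little cleaner. This is a minimal sketch assuming the same page, per_page, and product_id parameters seen in the URL above; the base URL is still a placeholder you would swap for the real endpoint:

    import requests
    from bs4 import BeautifulSoup as bs
    
    base_url = 'https://urlpart'  # placeholder endpoint, as above
    
    with requests.Session() as s:
        # requests url-encodes this dict into ?page=1&per_page=31&product_id=someid
        params = {'page': 1, 'per_page': 31, 'product_id': 'someid'}
        r = s.get(base_url, params=params).json()
        soup = bs(r['html'], 'lxml')
        total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    
        for page in range(2, total_pages + 1):
            params['page'] = page  # only the page number changes between requests
            r = s.get(base_url, params=params).json()
            soup = bs(r['html'], 'lxml')
            print([i.text for i in soup.select('.jdgm-rev__title')])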

    Example of collecting the results into a dataframe and writing to CSV

    import requests
    from bs4 import BeautifulSoup as bs
    import pandas as pd
    
    start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
    
    authors = []
    titles = []
    
    with requests.Session() as s:
        r = s.get(start_url).json()
        soup = bs(r['html'], 'lxml')
        authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
        titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
        total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
    
        for page in range(2, total_pages + 1):
            r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
            soup = bs(r['html'], 'lxml')
            authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
            titles.extend([i.text for i in soup.select('.jdgm-rev__title')])  # etc.
    
    headers = ['Author', 'Title']
    df = pd.DataFrame(zip(authors, titles), columns=headers)
    df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8', index=False)
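
    One caveat on the question's original CSV writing: concatenating fields with raw commas (f.write(... + "," + ...)) produces a malformed file whenever a review title or body itself contains a comma or newline. If you'd rather not pull in pandas, the standard-library csv module handles the quoting for you; a minimal sketch reusing the authors and titles lists built above:

    import csv
    
    with open('compreviews.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)  # automatically quotes fields containing commas or newlines
        writer.writerow(['Author', 'Title'])
        writer.writerows(zip(authors, titles))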