Tags: python, python-3.x, selenium-webdriver, web-scraping, beautifulsoup

Web scraping with Python on a site without pagination


I scrape data from the website using Selenium and BS4 and save it to a JSON file. Since the site has no URL-based pagination, I added a Selenium web driver to click through the pages. My old code (without Selenium) collected the data successfully, but since adding Selenium the JSON file comes out empty. How can I fix this without breaking the existing structure?

My old code (successfully collects data):

from bs4 import BeautifulSoup
import cloudscraper
import json

url = "https://www.brickeconomy.com/sets/year/2024"

# Create a scraper instance
scraper = cloudscraper.create_scraper()

# Send a GET request to the URL
response = scraper.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # List to hold all set data
    sets_data = []

    # Find all table rows containing set information
    table_rows = soup.find('table', id='ContentPlaceHolder1_ctlSets_GridViewSets').find_all('tr', align='left')

    # Iterate over each row to extract set details
    for row in table_rows:
        set_info = {}

        # Find the <h4> element containing the set name and ID
        set_name_elem = row.find('h4')
        if set_name_elem:
            set_string = set_name_elem.text.strip()
            set_info['id'], set_info['name'] = set_string.split(' ', 1)

        # Find <div> elements containing Year, Pieces/Minifigs, and other information
        div_elements = row.find_all('div', class_='mb-2')

        for div in div_elements:
            label = div.find('small', class_='text-muted mr-5')
            if label:
                label_text = label.text.strip()

                if label_text == 'Year':
                    set_info['year'] = div.text.replace('Year', '').strip()

        # Find all <td> elements with class="ctlsets-right text-right"
        td_elements = row.find_all('td', class_='ctlsets-right text-right')

        # Process each <td> element
        for td in td_elements:
            div_elements = td.find_all('div')
            for div in div_elements:
                # If the div content contains "Retail", get the price from the next sibling
                if "Retail" in div.text:
                    retail_price = div.text.strip()
                    price_without_retail = ' '.join(retail_price.split()[1:])
                    set_info['price'] = price_without_retail

                    first_sibling = div.find_next_sibling()
                    if first_sibling:
                        content = first_sibling.text.strip()
                        set_info['retail'] = content

                        second_sibling = first_sibling.find_next_sibling()
                        if second_sibling:
                            content2 = second_sibling.text.strip()
                            set_info['detail'] = content2
                        else:
                            set_info['detail'] = "None"
                    else:
                        print("Not Found Retail.")

        # Add the set information to the list
        sets_data.append(set_info)

    # Convert the extracted set data to JSON format and write to a file
    with open('sets_data.json', 'w') as json_file:
        json.dump(sets_data, json_file, ensure_ascii=False, indent=4)

    print("Sets data extracted successfully and saved to sets_data.json.")

else:
    print("HTTP Error Code:", response.status_code)

My current code (with web driver):

import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Initialize WebDriver (Safari, Chrome, Firefox, etc.)
driver = webdriver.Chrome()  # or change to webdriver.Firefox() or webdriver.Safari()

url = "https://www.brickeconomy.com/sets/year/2024"
max_iterations = 2  # Specify how many pages to fetch
delay_seconds = 2  # Delay time between each page transition (seconds)

all_sets_data = []  # List to hold all set data

try:
    for i in range(max_iterations):
        driver.get(url)

        # Wait for the table to load when the page is loaded
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'ContentPlaceHolder1_ctlSets_GridViewSets')))

        # Process the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        sets_data = []

        # Find all rows in the table
        table = soup.find('table', id='ContentPlaceHolder1_ctlSets_GridViewSets')
        if table:
            table_rows = table.find_all('tr', align='left')

            # Extract set information from each row
            for row in table_rows:
                set_info = {}

                # Find the <h4> element containing the set name
                set_name_elem = row.find('h4')
                if set_name_elem:
                    set_string = set_name_elem.text.strip()
                    set_info['id'], set_info['name'] = set_string.split(' ', 1)

                # Find <div> elements containing Year and other information
                div_elements = row.find_all('div', class_='mb-2')

                for div in div_elements:
                    label = div.find('small', class_='text-muted mr-5')
                    if label:
                        label_text = label.text.strip()

                        if label_text == 'Year':
                            set_info['year'] = div.text.replace('Year', '').strip()

                sets_data.append(set_info)

            # Add the extracted set data to the list of all sets
            all_sets_data.extend(sets_data)

            print(f"Sets data for iteration {i + 1} extracted successfully.")

            # Click the "Next" button to go to the next page
            # find_elements returns an empty list instead of raising
            # NoSuchElementException, so the else branch is reachable
            next_buttons = driver.find_elements(By.XPATH, "//a[contains(text(), 'Next')]")
            if next_buttons:
                next_buttons[0].click()

                # Wait for a specified time before the next iteration (rate limiting)
                time.sleep(delay_seconds)
            else:
                print("Next button not found. Exiting loop.")
                break
        else:
            print("Table not found. Exiting loop.")
            break

except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    # Close the WebDriver
    driver.quit()

    # Write all set data to a single JSON file
    if all_sets_data:
        with open('all_sets_data.json', 'w') as json_file:
            json.dump(all_sets_data, json_file, ensure_ascii=False, indent=4)
        print("All sets data extracted successfully and saved to all_sets_data.json.")
    else:
        print("No sets data extracted or saved.")

Current output:

[
    {},
    {},
    {},
    {},
    {},
...
]

Solution

  • Here is another version without using Selenium: the site paginates via ASP.NET postbacks, so you can collect the hidden form fields from the initial GET request and POST them back with __EVENTARGUMENT set to the page you want:

    import requests
    from bs4 import BeautifulSoup
    
    url = "https://www.brickeconomy.com/sets/year/2024"
    
    
    def get_data(soup):
        # Collect every <input> that carries a value - this includes the
        # hidden ASP.NET state fields (__VIEWSTATE, __EVENTVALIDATION, ...)
        # the server expects back on each postback
        data = {}
        for inp in soup.select("input[value]"):
            data[inp["name"]] = inp["value"]

        # Drop the submit buttons so the POST doesn't trigger their actions
        del data["ctl00$ContentPlaceHolder1$ctlSets$cmdPBOwnedWantedChanged"]
        del data["ctl00$cmdRegionModalPB"]
        del data["ctl00$cmdDefault"]
        del data["ctl00$cmdLoginModalPB"]
        del data["ctl00$cmdSearchHeader2"]
        del data["ctl00$cmdSearchHeader"]

        # Tell the ScriptManager which UpdatePanel/control fires the async postback
        data["ctl00$ScriptManager1"] = (
            "ctl00$ContentPlaceHolder1$ctlSets$UpdatePanelMain|ctl00$ContentPlaceHolder1$ctlSets$GridViewSets"
        )
        data["ctl00$txtSearchHeader2"] = ""
        data["ctl00$txtSearchHeader"] = ""

        # The GridView pager event: __EVENTARGUMENT picks the page number
        data["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets"
        data["__EVENTARGUMENT"] = "Page$1"
        data["__ASYNCPOST"] = "true"
        data["setsorter"] = "SetNumberASC"
        data[""] = ""

        return data
    
    
    with requests.session() as s:
        s.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0"
            }
        )
        # load cookies/POST data
        soup = BeautifulSoup(s.get(url).text, "html.parser")
        data = get_data(soup)
    
        for p in range(1, 4):  # <-- adjust number of pages here
            data["__EVENTARGUMENT"] = f"Page${p}"
            soup = BeautifulSoup(s.post(url, data=data).text, "html.parser")
    
            # each set row contains a link (<a>) but no nested table
            for tr in soup.select("tr:has(a):not(:has(tr))"):
                print(tr.h4.text)  # "<set number> <set name>"

                # theme: the first <small> tag's following siblings are the theme links
                theme = ", ".join(sib.text for sib in tr.find("small").find_next_siblings())
                print(theme)

                # label/value pairs such as Year, Availability, Retail
                for div in tr.select("div:has(>small)"):
                    k, v = div.small.text, div.small.find_next_sibling(string=True)
                    if v and v.strip():
                        print(k, v.strip())
                print("-" * 80)
    

    Prints:

    ...
    
    --------------------------------------------------------------------------------
    42603 Stargazing Camping Vehicle
    Friends, Space
    Year 2024
    Pieces / Mini-doll figures 364 / 2
    Availability Retail
    Retail 29,99 €
    --------------------------------------------------------------------------------
    42604 Heartlake City Shopping Mall
    Friends, Heartlake City
    Year 2024
    Pieces / Mini-doll figures 1,237 / 7
    Availability Retail
    Retail 119,99 €
    --------------------------------------------------------------------------------
    
    ...
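
    To match the original goal of saving the data to a JSON file rather than
    printing it, the page loop inside the with block can collect each row into
    a dict and dump the list at the end. A minimal sketch, reusing the session
    s, the data dict, and the selectors from above (the lower-cased label keys
    such as "year" and "retail" are a choice made here, not part of the
    original answer):

    import json

    # minimal sketch: same postback approach as above, but collecting the
    # rows into dicts instead of printing them
    all_sets = []
    for p in range(1, 4):  # <-- adjust number of pages here
        data["__EVENTARGUMENT"] = f"Page${p}"
        soup = BeautifulSoup(s.post(url, data=data).text, "html.parser")

        for tr in soup.select("tr:has(a):not(:has(tr))"):
            set_info = {}
            # the <h4> holds "<set number> <set name>"
            set_info["id"], set_info["name"] = tr.h4.text.strip().split(" ", 1)
            # store each label/value pair under its lower-cased label
            for div in tr.select("div:has(>small)"):
                k = div.small.text.strip()
                v = div.small.find_next_sibling(string=True)
                if v and v.strip():
                    set_info[k.lower()] = v.strip()
            all_sets.append(set_info)

    with open("sets_data.json", "w") as f:
        json.dump(all_sets, f, ensure_ascii=False, indent=4)

    This keeps the same output shape as the old cloudscraper code (a list of
    dicts written with json.dump), so downstream consumers of the JSON file
    should not need to change.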