I scrape data from a website with BS4 and save it to a JSON file. Since the site has no URL-based pagination, I added a Selenium web driver to click through the pages. My old code (without Selenium) collected the data successfully, but with Selenium the JSON file now contains only empty objects. How can I fix this without breaking the existing structure?
My old code (successfully collects data):
from bs4 import BeautifulSoup
import cloudscraper
import json

url = "https://www.brickeconomy.com/sets/year/2024"

# Create a scraper instance
scraper = cloudscraper.create_scraper()

# Send a GET request to the URL
response = scraper.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # List to hold all set data
    sets_data = []

    # Find all table rows containing set information
    table_rows = soup.find('table', id='ContentPlaceHolder1_ctlSets_GridViewSets').find_all('tr', align='left')

    # Iterate over each row to extract set details
    for row in table_rows:
        set_info = {}

        # Find the <h4> element containing the set name and ID
        set_name_elem = row.find('h4')
        if set_name_elem:
            set_string = set_name_elem.text.strip()
            set_info['id'], set_info['name'] = set_string.split(' ', 1)

        # Find <div> elements containing Year, Pieces/Minifigs, and other information
        div_elements = row.find_all('div', class_='mb-2')
        for div in div_elements:
            label = div.find('small', class_='text-muted mr-5')
            if label:
                label_text = label.text.strip()
                if label_text == 'Year':
                    set_info['year'] = div.text.replace('Year', '').strip()

        # Find all <td> elements with class="ctlsets-right text-right"
        td_elements = row.find_all('td', class_='ctlsets-right text-right')

        # Process each <td> element
        for td in td_elements:
            div_elements = td.find_all('div')
            for div in div_elements:
                # If the div content contains "Retail", get the price from the next sibling
                if "Retail" in div.text:
                    retail_price = div.text.strip()
                    price_without_retail = ' '.join(retail_price.split()[1:])
                    set_info['price'] = price_without_retail

                    first_sibling = div.find_next_sibling()
                    if first_sibling:
                        content = first_sibling.text.strip()
                        set_info['retail'] = content

                        second_sibling = first_sibling.find_next_sibling()
                        if second_sibling:
                            content2 = second_sibling.text.strip()
                            set_info['detail'] = content2
                        else:
                            set_info['detail'] = "None"
                else:
                    print("Not Found Retail.")

        # Add the set information to the list
        sets_data.append(set_info)

    # Convert the extracted set data to JSON format and write to a file
    with open('sets_data.json', 'w') as json_file:
        json.dump(sets_data, json_file, ensure_ascii=False, indent=4)

    print("Sets data extracted successfully and saved to sets_data.json.")
else:
    print("HTTP Error Code:", response.status_code)
My current code (with web driver):
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Initialize WebDriver (Safari, Chrome, Firefox, etc.)
driver = webdriver.Chrome()  # or change to webdriver.Firefox() or webdriver.Safari()

url = "https://www.brickeconomy.com/sets/year/2024"
max_iterations = 2  # Specify how many pages to fetch
delay_seconds = 2   # Delay time between each page transition (seconds)
all_sets_data = []  # List to hold all set data

try:
    for i in range(max_iterations):
        driver.get(url)

        # Wait for the table to load when the page is loaded
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'ContentPlaceHolder1_ctlSets_GridViewSets')))

        # Process the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        sets_data = []

        # Find all rows in the table
        table = soup.find('table', id='ContentPlaceHolder1_ctlSets_GridViewSets')
        if table:
            table_rows = table.find_all('tr', align='left')

            # Extract set information from each row
            for row in table_rows:
                set_info = {}

                # Find the <h4> element containing the set name
                set_name_elem = row.find('h4')
                if set_name_elem:
                    set_string = set_name_elem.text.strip()
                    set_info['id'], set_info['name'] = set_string.split(' ', 1)

                # Find <div> elements containing Year and other information
                div_elements = row.find_all('div', class_='mb-2')
                for div in div_elements:
                    label = div.find('small', class_='text-muted mr-5')
                    if label:
                        label_text = label.text.strip()
                        if label_text == 'Year':
                            set_info['year'] = div.text.replace('Year', '').strip()

                sets_data.append(set_info)

            # Add the extracted set data to the list of all sets
            all_sets_data.extend(sets_data)
            print(f"Sets data for iteration {i + 1} extracted successfully.")

            # Click the "Next" button to go to the next page
            next_button = driver.find_element(By.XPATH, "//a[contains(text(), 'Next')]")
            if next_button:
                next_button.click()
                # Wait for a specified time before the next iteration (rate limiting)
                time.sleep(delay_seconds)
            else:
                print("Next button not found. Exiting loop.")
                break
        else:
            print("Table not found. Exiting loop.")
            break
except Exception as e:
    print(f"An error occurred: {str(e)}")
finally:
    # Close the WebDriver
    driver.quit()

# Write all set data to a single JSON file
if all_sets_data:
    with open('all_sets_data.json', 'w') as json_file:
        json.dump(all_sets_data, json_file, ensure_ascii=False, indent=4)
    print("All sets data extracted successfully and saved to all_sets_data.json.")
else:
    print("No sets data extracted or saved.")
Current output:
[
{},
{},
{},
{},
{},
...
]
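Every {} means the row matched by align='left' contained neither an <h4> nor a 'Year' label, yet set_info is appended anyway. A quick sanity check (a sketch reusing the selectors from the code above) is to log what the Selenium-rendered rows actually contain:

# Sketch: run right after building `soup` in the loop above to see what matched.
table = soup.find('table', id='ContentPlaceHolder1_ctlSets_GridViewSets')
rows = table.find_all('tr', align='left') if table else []
print(f"{len(rows)} rows matched, "
      f"{sum(1 for r in rows if r.find('h4'))} of them contain an <h4>")
if rows:
    print(rows[0].prettify()[:500])  # peek at the first matched row's HTML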
Here is another version without using Selenium:
import requests
from bs4 import BeautifulSoup

url = "https://www.brickeconomy.com/sets/year/2024"


def get_data(soup):
    # Collect every <input> that carries a value (ASP.NET state such as
    # __VIEWSTATE / __EVENTVALIDATION) so the paging postback can be replayed.
    data = {}
    for inp in soup.select("input[value]"):
        data[inp["name"]] = inp["value"]

    # Remove button fields so their click handlers aren't triggered by the POST:
    del data["ctl00$ContentPlaceHolder1$ctlSets$cmdPBOwnedWantedChanged"]
    del data["ctl00$cmdRegionModalPB"]
    del data["ctl00$cmdDefault"]
    del data["ctl00$cmdLoginModalPB"]
    del data["ctl00$cmdSearchHeader2"]
    del data["ctl00$cmdSearchHeader"]

    # Target the GridView's async postback (UpdatePanel partial page update):
    data["ctl00$ScriptManager1"] = (
        "ctl00$ContentPlaceHolder1$ctlSets$UpdatePanelMain|ctl00$ContentPlaceHolder1$ctlSets$GridViewSets"
    )
    data["ctl00$txtSearchHeader2"] = ""
    data["ctl00$txtSearchHeader"] = ""
    # Fire the GridView paging event; __EVENTARGUMENT is overwritten per page below:
    data["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets"
    data["__EVENTARGUMENT"] = "Page$1"
    data["__ASYNCPOST"] = "true"
    data["setsorter"] = "SetNumberASC"
    data[""] = ""
    return data


with requests.session() as s:
    s.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0"
        }
    )

    # load cookies/POST data
    soup = BeautifulSoup(s.get(url).text, "html.parser")
    data = get_data(soup)

    for p in range(1, 4):  # <-- adjust number of pages here
        data["__EVENTARGUMENT"] = f"Page${p}"
        soup = BeautifulSoup(s.post(url, data=data).text, "html.parser")

        # Rows that contain a link and no nested rows are the set rows:
        for tr in soup.select("tr:has(a):not(:has(tr))"):
            print(tr.h4.text)

            # theme:
            theme = ", ".join(s.text for s in tr.find("small").find_next_siblings())
            print(theme)

            for div in tr.select("div:has(>small)"):
                k, v = div.small.text, div.small.find_next_sibling(string=True)
                if v and v.strip():
                    print(k, v.strip())

            print("-" * 80)
Prints:
...
--------------------------------------------------------------------------------
42603 Stargazing Camping Vehicle
Friends, Space
Year 2024
Pieces / Mini-doll figures 364 / 2
Availability Retail
Retail 29,99 €
--------------------------------------------------------------------------------
42604 Heartlake City Shopping Mall
Friends, Heartlake City
Year 2024
Pieces / Mini-doll figures 1,237 / 7
Availability Retail
Retail 119,99 €
--------------------------------------------------------------------------------
...
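If you want the same kind of JSON file the original script produced instead of printed lines, the row loop can build dicts and dump them at the end. A minimal sketch reusing the answer's selectors (the 'theme' key and the file name are my own choices):

import json

all_sets = []  # accumulate rows across all pages

# ...same session / get_data() / page loop as above; per page, replace the
# print statements with:
for tr in soup.select("tr:has(a):not(:has(tr))"):
    set_id, name = tr.h4.text.strip().split(" ", 1)
    info = {"id": set_id, "name": name}
    info["theme"] = ", ".join(s.text for s in tr.find("small").find_next_siblings())
    for div in tr.select("div:has(>small)"):
        k, v = div.small.text, div.small.find_next_sibling(string=True)
        if v and v.strip():
            info[k.strip()] = v.strip()
    all_sets.append(info)

# after the last page:
with open("sets_data.json", "w") as f:
    json.dump(all_sets, f, ensure_ascii=False, indent=4)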