I'm having trouble with a crawler script that I wrote in Python. I'm grabbing the features and specifications (which are bullet points) for a list of product URLs:
import csv
import requests
from bs4 import BeautifulSoup
# Listing-page URL template; trade_spider fills the `{}` placeholder with the
# page number via url.format(page).
url = 'https://www.academy.com/shop/browse/footwear/womens-footwear/womens-work-boots?&page_{}'
def trade_spider(max_pages):
    """Crawl the product listing pages and collect details for each product.

    Args:
        max_pages: Number of listing pages to visit, starting at page 1.

    Returns:
        A list of ``[name, features, specs]`` entries, one per product link
        found on the visited listing pages.
    """
    # Imported locally so the function stays self-contained with respect to
    # the file's import block.
    from urllib.parse import urljoin

    data = []
    for page in range(1, max_pages + 1):
        current_url = url.format(page)
        source_code = requests.get(current_url)
        soup = BeautifulSoup(source_code.text, 'html.parser')
        for link in soup.find_all('a', {'class': 'product-detail-container'}):
            path = link.get('href')
            if not path:
                # An anchor without an href cannot be followed; concatenating
                # None onto the base URL would raise TypeError.
                continue
            # urljoin avoids the doubled slash that plain concatenation
            # produces when the href already starts with "/".
            href = urljoin("https://www.academy.com/", path)
            name = get_single_item_data1(href)
            features = get_single_item_data5(href)
            specs = get_single_item_data6(href)
            data.append([name, features, specs])
    return data
def get_single_item_data1(item_url):
    """Fetch a product page and return the text of its name element.

    Looks up the first ``div`` whose class is ``flex-wrap flex-fill``, prints
    its ``.string`` and returns it. Returns ``None`` when no such ``div`` is
    present on the page.
    """
    response = requests.get(item_url)
    page = BeautifulSoup(response.text, 'html.parser')
    name_div = page.find('div', {'class': 'flex-wrap flex-fill'})
    if name_div is None:
        return None
    print('name:', name_div.string)
    return name_div.string
def get_single_item_data5(item_url):
    """Fetch a product page and return all of its feature bullet points.

    Args:
        item_url: URL of the product detail page.

    Returns:
        The text of every ``li`` element carrying
        ``data-auid="feature-benefits-listing"``, joined with single spaces
        into one string. The string is empty when the page has no such
        elements.
    """
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.text, 'html.parser')
    # Gather every bullet before returning: a `return` inside the loop hands
    # back only the first bullet and discards the rest.
    # get_text is used instead of .string, which is None for an element with
    # more than one child node.
    bullets = [
        item.get_text(' ', strip=True)
        for item in soup.find_all('li', {'data-auid': 'feature-benefits-listing'})
    ]
    features = ' '.join(bullets)
    print('features: ', features)
    return features
def get_single_item_data6(item_url):
    """Fetch a product page and return all of its specification bullet points.

    Args:
        item_url: URL of the product detail page.

    Returns:
        The text of every ``li`` element carrying
        ``data-auid="specifications_listing"``, joined with single spaces
        into one string. The string is empty when the page has no such
        elements.
    """
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.text, 'html.parser')
    # Gather every bullet before returning: a `return` inside the loop hands
    # back only the first bullet and discards the rest.
    bullets = [
        item.get_text(' ', strip=True)
        for item in soup.find_all('li', {'data-auid': 'specifications_listing'})
    ]
    specs = ' '.join(bullets)
    print('specifications: ', specs)
    return specs
# Scrape the first listing page and write one CSV row per product.
row_data = trade_spider(1)
row_headers = ['name', 'features', 'specs']
# The csv module expects its file object to be opened with newline='';
# without it, extra blank rows can appear on platforms that translate line
# endings. An explicit encoding keeps the output independent of the locale.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(row_headers)
    writer.writerows(row_data)
print()
The scraping works when I print out the results, but when I save them to a CSV file, only the first bullet point of each list is written.
Desired sample output:
Name: Lace Up Work Boots
Features: Steel toes safeguard your feet Suede uppers provide durability 5.5" boot height Meets ASTM F2413-11 I/75 C/75 standards Goodyear welt construction EVA insoles offer cushioning
Specifications: Activity: Work Steel toe: Yes Safety Toe: Steel Gender: Women's Material: Suede Boot height (in.): 6" and Under Slip Resistant: Yes Waterproof: No Electrical hazard (EH) rated: No
But instead, this is all I'm getting right now:
Name: Lace Up Work Boots
Features: Steel toes safeguard your feet
Specifications: Activity: Work
Any help?
You can use this example to see how to grab the data from the items and save it to CSV:
import csv
import requests
from bs4 import BeautifulSoup
# Listing-page URL template; the script below fills the `{}` placeholder with
# the page number via url.format(page).
url = "https://www.academy.com/shop/browse/footwear/womens-footwear/womens-work-boots?&page_{}"
def grab_data(url):
    """Download a product page and extract its title, features and specs.

    Returns a ``(title, features, specs)`` tuple: the stripped text of the
    page's first ``h1``, followed by two lists holding the stripped text of
    each element whose ``data-auid`` attribute is
    ``feature-benefits-listing`` and ``specifications_listing`` respectively.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content, "html.parser")

    def texts(selector):
        # Stripped text of every element matching the CSS selector.
        return [node.get_text(strip=True) for node in page.select(selector)]

    heading = page.h1.get_text(strip=True)
    feature_items = texts('[data-auid="feature-benefits-listing"]')
    spec_items = texts('[data-auid="specifications_listing"]')
    return heading, feature_items, spec_items
# The csv module expects its file object to be opened with newline="";
# without it, extra blank rows can appear on platforms that translate line
# endings. An explicit encoding keeps the output independent of the locale.
with open("data.csv", "w", newline="", encoding="utf-8") as f_out:
    csv_writer = csv.writer(f_out)
    csv_writer.writerow(["Name", "Features", "Specifications"])

    for page in range(1, 2):  # <-- adjust to your page count
        soup = BeautifulSoup(
            requests.get(url.format(page)).content, "html.parser"
        )

        # Each matching anchor on the listing page links to one product.
        for a in soup.select("a.detail-card"):
            u = "https://www.academy.com" + a["href"]
            title, feats, specs = grab_data(u)

            print(title)
            print("*** Features:")
            print(*feats, sep="\n")
            print("*** Specifications:")
            print(*specs, sep="\n")
            print("-" * 80)

            # Flatten each bullet list into a single space-separated cell.
            csv_writer.writerow([title, " ".join(feats), " ".join(specs)])
Prints:
Brazos Women's Tradesman Steel Toe Lace Up Work Boots
*** Features:
Steel toes safeguard your feet
Suede uppers provide durability
5.5" boot height
Meets ASTM F2413-11 I/75 C/75 standards
Goodyear welt construction
EVA insoles offer cushioning
*** Specifications:
Activity:Work
Steel toe:Yes
Safety Toe:Steel
Gender:Women's
Material:Suede
Boot height (in.):6" and Under
Slip Resistant:Yes
Waterproof:No
Electrical hazard (EH) rated:No
--------------------------------------------------------------------------------
Fila Women's Memory Workshift Service Shoes
*** Features:
Slip-resistant rubber outsoles
Synthetic uppers
Low-top style
EVA midsoles
*** Specifications:
Activity:Work
Steel toe:No
Safety Toe:Soft
Gender:Women's
Material:Man-made Materials
Slip Resistant:Yes
Waterproof:No
Electrical hazard (EH) rated:No
--------------------------------------------------------------------------------
...and so on.
and saves the results to data.csv
(Screenshot from LibreOffice):