So my scraper is only sending the last two items to csv from last page it processed.I can not figure out where i am doing wrong it prints output perfectly fine.May be experience set of eyes able to help.
Code Below:
from requests_html import HTMLSession
import csv
import time
def get_links(url):
_request = _session.get(url)
items = _request.html.find('li.product-grid-view.product.sale')
links = []
for item in items:
links.append(item.find('a', first=True).attrs['href'])
# print(len(links))
return links
def get_product(link):
_request = _session.get(link)
title = _request.html.find('h2', first=True).full_text
price = _request.html.find('span.woocommerce-Price-amount.amount bdi')[1].full_text
sku = _request.html.find('span.sku', first=True).full_text
categories = _request.html.find('span.posted_in', first=True).full_text.replace('Categories:', "").strip()
brand = _request.html.find('span.posted_in')[1].full_text.replace('Brand:', "").strip()
#print(brand)
product = {
'Title': title,
'Price': price,
'SKU': sku,
'Categories': categories,
'Brand': brand
}
#print(product)
return product
if __name__ == '__main__':
for page in range(1, 4):
url = 'https://www.thebassplace.com/product-category/basses/4-string/'
if page == 1:
parse_url = url
else:
parse_url = f'https://www.thebassplace.com/product-category/basses/4-string/page/{page}/'
_session = HTMLSession()
links = get_links(parse_url)
results = []
for link in links:
results.append(get_product(link))
time.sleep(1)
#print(len(results))
with open('on_sale_bass.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=results[0].keys())
writer.writeheader()
for row in results:
writer.writerow(row)
when i try to append records are written in csv but headers are repeating for each page iteration.
The problem was in the statement results = []
, inside the range loop. You emptied the results
on each iteration of the range(1, 4)
loop. Thus, you were getting only what the last iteration brought in.
Note, I made the _session
as global
, but in this case it would be reasonable, in my opinion (feel free to correct), to just pass it between functions. Now, try this out.
from requests_html import HTMLSession
import csv
import time
def get_links(url):
global _session
_request = _session.get(url)
items = _request.html.find('li.product-grid-view.product.sale')
links = []
for item in items:
links.append(item.find('a', first=True).attrs['href'])
return links
def get_product(link):
global _session
_request = _session.get(link)
title = _request.html.find('h2', first=True).full_text
price = _request.html.find('span.woocommerce-Price-amount.amount bdi')[1].full_text
sku = _request.html.find('span.sku', first=True).full_text
categories = _request.html.find('span.posted_in', first=True).full_text.replace('Categories:', "").strip()
brand = _request.html.find('span.posted_in')[1].full_text.replace('Brand:', "").strip()
product = {
'Title': title,
'Price': price,
'SKU': sku,
'Categories': categories,
'Brand': brand
}
return product
if __name__ == '__main__':
results = []
for page in range(1, 4):
url = 'https://www.thebassplace.com/product-category/basses/4-string/'
if page == 1:
parse_url = url
else:
parse_url = f'https://www.thebassplace.com/product-category/basses/4-string/page/{page}/'
_session = HTMLSession()
links = get_links(parse_url)
for link in links:
product = get_product(link)
results.append(product)
#time.sleep(1)
with open('on_sale_bass.csv', 'w', newline='', encoding='utf-8') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=results[0].keys())
writer.writeheader()
for row in results:
writer.writerow(row)
What I get as an example: