We have a vendor that sells many products that we want to include. Unfortunately, they do not have an API set up for retrieving information. They do, however, give us product lists that show just the SKU of each item. I want to log in, go to the page that holds the product, scrape specific info from that page, and move on to the next. I want to do this line by line, one at a time, since their website is fragile and prone to going offline.
Basically, this spider should:
import scrapy
from scrapy import Spider
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
import csv
class SPIDER_NAME(Spider):
    """Log in to the vendor site, then fetch one product page per SKU URL
    listed in partsList.csv and scrape its details.

    The login form is ASP.NET-style: the anti-forgery token scraped from
    the page must be echoed back in the POST.
    """
    name = 'RPLogin_Final'
    start_urls = ['WEBSITEURL']

    # The vendor site is fragile and prone to going offline, so crawl
    # strictly one request at a time with a small pause between pages.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 1,
    }

    def parse(self, response):
        """Submit the login form, echoing back the anti-forgery token."""
        token = response.css(
            "form input[name=__RequestVerificationToken]::attr(value)"
        ).extract_first()
        return FormRequest(
            url=self.start_urls[0],
            formdata={'__RequestVerificationToken': token,
                      'upw': 'PASSWORD',
                      'uid': 'USERNAME'},
            callback=self.scrape_now,
        )

    def scrape_now(self, response):
        """After login, queue one product-page request per CSV row."""
        print("logged in!")
        with open('partsList.csv', 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            for row in csv_reader:
                # csv.reader yields a LIST of fields per line, not a string.
                # Passing the list raises
                #   "TypeError: Request url must be str, got list",
                # so take the first (only) column and strip whitespace.
                if not row:
                    continue  # tolerate blank lines in the CSV
                url = row[0].strip()
                print(url)
                yield scrapy.Request(url=url, callback=self.parse_product)

    def parse_product(self, response):
        """Extract product fields from a single product page.

        NOTE(review): selectors that match nothing make .get() return None;
        downstream consumers should expect None values for missing fields.
        """
        product = response.css('div.row.jsCartContainer.product-list-item')
        yield {
            'Name': product.css("p.jplist-text-filter::text").get(),
            'Part_Num': product.css("a.jplist-text-filter.jplist-item-num::text").get(),
            'Purchase_Price': product.css("li.jplist-item-price.bold::text").get(),
            'Suggested_Retail': product.css("li:nth-child(2)").get(),
            'In_Stock': product.css("li:nth-child(5)").get(),
            # Read the src attribute directly instead of the raw <img> tag HTML.
            'Image_Link': product.css("img::attr(src)").get(),
        }
        # open_in_browser(response) removed: it was a debug aid and would pop
        # a browser window for every single product scraped.
This spider worked fine when:
url = 'string_URL'
Then I changed this to read from a CSV file with:
with open('partsList.csv') as file:
url=[line.strip() for line in file]
and I received these errors:
logged in!
['URLS']
File "/home/partsales/Desktop/Python/parts-env/lib/python3.10/site-packages/scrapy/http/request/__init__.py", line 133, in _set_url
raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got list
So by using karel van dongen's suggestion, but replacing the CSV file with a TXT file, I was able to get this to execute perfectly!
complete (redacted) code below:
import scrapy
from scrapy import Spider
from scrapy import Request
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser
import csv
class PartsSpider(Spider):
    """Log in to the vendor site, then scrape one product page per URL
    listed in partlist.txt (one URL per line).

    The login form is ASP.NET-style: the anti-forgery token scraped from
    the page must be echoed back in the POST.
    """
    name = 'productSpider'
    start_urls = ['WEBSITE URL']

    # The vendor site is fragile and prone to going offline, so crawl
    # strictly one request at a time with a small pause between pages.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 1,
    }

    def parse(self, response):
        """Submit the login form, echoing back the anti-forgery token."""
        token = response.css(
            "form input[name=__RequestVerificationToken]::attr(value)"
        ).extract_first()
        return FormRequest(
            url=self.start_urls[0],
            formdata={
                '__RequestVerificationToken': token,
                #
                # Change USER AND PASSWORD
                #
                'upw': 'ENCODED PASSWORD',
                'uid': 'USERNAME',
            },
            callback=self.scrape_now,
        )

    def scrape_now(self, response):
        """After login, queue one product-page request per line of the file."""
        print("logged in!")
        #
        # CHANGE FILE NAME
        #
        with open('partlist.txt') as partslist:
            for line in partslist:
                curr_url = line.strip()
                if not curr_url:
                    continue  # tolerate blank lines in the list
                print(curr_url)
                yield Request(url=curr_url, callback=self.parse_product)

    @staticmethod
    def _clean(value, *remove):
        """Delete each substring in *remove* from value and strip whitespace.

        Returns None unchanged: a CSS selector that matches nothing makes
        .get() return None, and the original .get().replace(...) chains
        crashed the whole spider with AttributeError on such pages.
        """
        if value is None:
            return None
        for fragment in remove:
            value = value.replace(fragment, '')
        return value.strip()

    def parse_product(self, response):
        """Extract product fields from a single product page.

        Missing fields yield None instead of raising, so one odd page
        cannot abort the crawl.
        """
        product = response.css('div.row.jsCartContainer.product-list-item')
        # Read the src attribute directly instead of slicing raw <img> HTML
        # with a brittle magic [:-16] offset.
        image_src = product.css("img.product-img::attr(src)").get()
        yield {
            #
            # CHANGE BRAND
            #
            'Brand': 'BRAND',
            'Name': product.css("p.jplist-text-filter::text").get(),
            'Part_Num': self._clean(
                product.css("a.jplist-text-filter.jplist-item-num::text").get(),
                ' '),
            'Purchase_Price': self._clean(
                product.css("li.jplist-item-price.bold::text").get(),
                'Your Price:', ' / EA'),
            'Suggested_Retail': self._clean(
                product.css("li:nth-child(2)::text").get(),
                'Suggested Retail Price: '),
            'In_Stock': self._clean(
                product.css("li:nth-child(5)::text").get(),
                'Available: '),
            'Image_Link': None if image_src is None
                          else 'https://WEBSITE.com' + image_src,
        }