I have an issue crawling websites when I try to run the spider from a Google Colab script:
code for class:
class Campaign_Spider(scrapy.Spider):
    """Spider for the Crowdfunder "Business" project search listing.

    Every listing page in ``start_urls`` is scanned for campaign links
    (anchors whose class contains ``cf-pod__link``); each link is then
    requested and handed to :meth:`parse_page`.
    """

    # Name of the spider.
    name = "crowdfunder"
    # For the full list of listed campaigns we could set this to 83.
    npages = 83
    # The first listing page, followed by the numbered pages. The numbered
    # URLs mimic getting the pages using the "next" button.
    start_urls = [
        "https://www.crowdfunder.co.uk/search/projects?category=Business&map=off"
    ] + [
        f"https://www.crowdfunder.co.uk/search/projects?page={page}&category=Business&map=off"
        for page in range(2, npages + 2)
    ]

    def parse(self, response):
        """Yield one request per campaign link found on a listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            Requests for each linked campaign page, with
            :meth:`parse_page` as the callback.
        """
        hrefs = response.xpath("//a[contains(@class, 'cf-pod__link')]//@href").getall()
        for href in hrefs:
            # response.follow resolves relative hrefs against the page URL;
            # a bare scrapy.Request raises ValueError on a URL with no scheme.
            yield response.follow(href, callback=self.parse_page)

    def parse_page(self, response):
        """Callback for a single campaign page; extraction is not written yet.

        Args:
            response: The downloaded campaign page.
        """
        # Sketch of the intended implementation:
        # href = response.xpath("//a[contains(@class, 'cf-pod__link')]//@href")
        # Extract the information
        # ...
        # yield {
        #     'url': response.request.meta['referer'],
        #     ...
        # }
Code for wrapper to crawl:
# the wrapper to make it run more times
def run_spider(spider):
    """Crawl with *spider* inside a child process and surface its errors.

    The crawl and the reactor that drives it are started in a separate
    ``Process``. The child reports back through a ``Queue``: ``None`` when
    the reactor run returned normally, or the exception object that was
    caught. The parent reads that value before joining the child and
    re-raises it if it is an exception.

    Args:
        spider: The spider (class) passed on to ``CrawlerProcess.crawl``.

    Raises:
        Exception: Whatever exception the child process put on the queue.
    """

    def _crawl_in_child(outcome_queue):
        # Executed in the child process: run one crawl, then report back.
        try:
            feeds = {'crowdfunder.csv': {'format': 'csv', 'overwrite': True}}
            crawler_process = CrawlerProcess(settings={
                'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
                'FEEDS': feeds,
            })
            crawl_finished = crawler_process.crawl(spider)
            # Stop the reactor whether the crawl succeeded or failed.
            crawl_finished.addBoth(lambda _ignored: reactor.stop())
            reactor.run()
            outcome_queue.put(None)
        except Exception as error:
            outcome_queue.put(error)

    outcome_queue = Queue()
    child = Process(target=_crawl_in_child, args=(outcome_queue,))
    child.start()
    # Read the outcome before joining, as the original does.
    outcome = outcome_queue.get()
    child.join()
    if outcome is None:
        return
    raise outcome
I have tried printing the response in the code and would like for each of the urls in the start_urls list to refer the spider to further webpages to explore. I would appreciate any advice as I would like to implement this spider from a Google Colab script.
The output from the config_log:
config_log() Heading -> https://i.sstatic.net/7PjEc.png
Response/no crawling -> https://i.sstatic.net/yW5vN.png
Spider stats output log -> https://i.sstatic.net/MATrA.png
Here are the steps I followed to get to the solution.
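1. I looked through the page's network requests for the one that returns a `JSON` file, and I copied the request's headers, body, and url.
2. I removed `Content-Length` from the headers, and noted that the request for the API is a `POST` request.
3. I used the `json.dumps()` function on the payload.
4. I parsed the response with `response.json()`.

import scrapy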
import json
class Campaign_Spider(scrapy.Spider):
    """Spider that lists Crowdfunder "Business" campaigns via the search API.

    Instead of scraping the listing HTML, it POSTs the search query to the
    Algolia endpoint, reads the campaign URIs from the JSON response, and
    requests each campaign page for :meth:`parse_page`.
    """

    name = "crowdfunder"
    npages = 83  # For the full list of listed campaigns we could set this to 83.

    # https://docs.scrapy.org/en/latest/topics/practices.html#avoiding-getting-banned
    custom_settings = {
        'DOWNLOAD_DELAY': 0.6
    }

    def start_requests(self):
        """Yield one POST request to the search API per result page.

        Yields:
            ``scrapy.Request`` objects whose JSON body selects a single page
            of the ``frontendsearch`` index, filtered to the Business
            category with 12 hits per page.
        """
        api_url = 'https://7izdzrqwm2-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia for JavaScript (3.35.1); Browser (lite)&x-algolia-application-id=7IZDZRQWM2&x-algolia-api-key=9767ce6d672cff99e513892e0b798ae2'

        # headers for the API request
        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.5",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "content-type": "application/x-www-form-urlencoded",
            "DNT": "1",
            "Host": "7izdzrqwm2-dsn.algolia.net",
            "Origin": "https://www.crowdfunder.co.uk",
            "Pragma": "no-cache",
            "Referer": "https://www.crowdfunder.co.uk/",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "cross-site",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
        }

        # Algolia documents `page` as zero-based, so start at 0; starting at 1
        # would skip the first page of hits. The upper bound is unchanged, so
        # every page that was requested before is still requested.
        for page in range(self.npages + 2):
            payload = {
                "requests": [{
                    "indexName": "frontendsearch",
                    "params": f"facetFilters=%5B%22category%3ABusiness%22%5D&hitsPerPage=12&page={page}&aroundPrecision=1000&distinct=true&query=&insideBoundingBox=&facets=%5B%5D&tagFilters="
                }]
            }
            yield scrapy.Request(url=api_url, body=json.dumps(payload), method='POST', headers=headers)

    def parse(self, response):
        """Turn one API result page into requests for the campaign pages.

        Args:
            response: The JSON response from the search API.

        Yields:
            A ``scrapy.Request`` for every hit that carries a ``uri``, with
            :meth:`parse_page` as the callback.
        """
        results = response.json().get('results') or []
        if not results:
            # Nothing to iterate; previously this raised on the [0] lookup.
            self.logger.warning("No 'results' in API response from %s", response.url)
            return

        base_url = 'https://www.crowdfunder.co.uk'
        for hit in results[0].get('hits') or []:
            uri = hit.get('uri')
            if not uri:
                # Without a uri the URL would end in the literal text "None".
                continue
            # don't forget to add whatever headers you need
            yield scrapy.Request(f"{base_url}{uri}", callback=self.parse_page)

    def parse_page(self, response):
        """Callback for a single campaign page; extraction is left to the user.

        Args:
            response: The downloaded campaign page.
        """
        # parse whatever you want here from each webpage