I have a Scrapy project that works fine and returns the results I want. The spider code is:
import scrapy
from scrapy_splash import SplashRequest
from bs4 import BeautifulSoup as bs

from EScrapy.example.items import MyItem  # adjust to wherever MyItem lives in your project


class ExampleSpider(scrapy.Spider):
    name = 'example'
    custom_settings = {'FEED_URI': 'output/output.json', 'CLOSESPIDER_TIMEOUT': 50}
    myBaseUrl = ''
    start_urls = []

    def __init__(self, symbol_url='', **kwargs):
        super().__init__(**kwargs)
        self.myBaseUrl = symbol_url
        self.start_urls.append(self.myBaseUrl)

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 5})

    def parse(self, response):
        # walk the results table and follow each report link
        tbody = response.css('tbody.scrollContent')
        row = tbody.css('tr.table__row.ng-scope')
        col = row.css('td')
        links = col.css('a.letter-title.ng-binding.ng-scope::attr(href)')
        for link in links:
            yield response.follow("https://www.example.ir" + link.get(),
                                  callback=self.parse_page)

    def parse_page(self, response):
        page = bs(response.text, 'html.parser')
        # the name sits in parentheses inside the <head> text
        name = page.find('head', id='Head1').text.strip().split('(')[-1]
        name = name.replace(')', '').strip()
        try:
            tables = page.find_all('tr', class_="ComputationalRow")
            table_head = page.find('table', {'id': 'exampleID'})
            date = table_head.find_all('th')[-1].text.split(' ')[3]
            yield MyItem(name=name, date=date)
        except (AttributeError, IndexError):  # page without the expected table
            pass
And here is my Flask code to run the spider:
import crochet
crochet.setup()  # must run before the reactor is touched

import os
import time

from flask import Flask, render_template, jsonify, request, redirect, url_for
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.signalmanager import dispatcher

from EScrapy.example.spiders.example import ExampleSpider

app = Flask(__name__)
output_data = []
crawl_runner = CrawlerRunner()


@app.route('/')
def index():
    return render_template("index.html")


@app.route('/', methods=['POST'])
def submit():
    if request.method == 'POST':
        s = request.form['symbol']
        global baseURL
        baseURL = f'https://example.ir/ReportList.aspx?search&Symbol={s}'
        if os.path.exists("output/output.json"):
            os.remove("output/output.json")
        return redirect(url_for('scrape'))


@app.route("/scrape")
def scrape():
    scrape_with_crochet(baseURL=baseURL)
    time.sleep(20)  # wait for the crawl before returning whatever was collected
    return jsonify(output_data)


@crochet.run_in_reactor
@crochet.wait_for(timeout=10)
def scrape_with_crochet(baseURL):
    # append every scraped item to output_data via the item_scraped signal
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    eventual = crawl_runner.crawl(ExampleSpider, symbol_url=baseURL)
    return eventual


def _crawler_result(item, response, spider):
    output_data.append(dict(item))


if __name__ == "__main__":
    app.run(debug=True)
The spider works properly on its own when I put the link in start_urls and delete the __init__ method: it writes the JSON file as expected. But when I run it from the Flask application, Flask returns an empty list and I see this error in debug mode on the command line:
Unhandled error in EventualResult
Traceback (most recent call last):
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 662, in callback
self._startRunCallbacks(result)
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 764, in _startRunCallbacks
self._runCallbacks()
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 858, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 1751, in gotResult
current_context.run(_inlineCallbacks, r, gen, status)
--- <exception caught here> ---
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 1661, in _inlineCallbacks
result = current_context.run(gen.send, result)
builtins.StopIteration:
I need this project for my hiring task and am really stuck on this problem. Does anyone have an idea of how to solve it?
The problem comes from using baseURL as a global. Flask has request and application context concepts: a global set while handling one request is a fragile way to pass state to work that runs outside that request (with multiple workers, each process has its own globals). On top of that, the scraping can take longer than a request should be held open, so you shouldn't run it inside the request context at all. It's better to hand the crawl to a task queue such as Celery, typically with Redis as the broker. Just google "flask redis scrapy", or read the background-jobs chapter of the brilliant Flask Mega-Tutorial by Miguel Grinberg.
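A minimal sketch of that approach, assuming Celery with a local Redis broker and that the spider can be started from the command line with scrapy crawl example. The names here (tasks.py, run_spider, the redis:// URLs) are placeholders, not part of the original project, and the -O overwrite flag needs Scrapy 2.1+:

# tasks.py - Celery task that runs the spider in its own process.
# A subprocess sidesteps Twisted's "reactor is not restartable"
# problem inside a long-lived worker.
import json
import subprocess
import tempfile

from celery import Celery

celery_app = Celery('tasks',
                    broker='redis://localhost:6379/0',
                    backend='redis://localhost:6379/1')

@celery_app.task
def run_spider(symbol_url):
    with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
        out = f.name
    # -a passes the spider's constructor argument, -O (over)writes the feed
    subprocess.run(
        ['scrapy', 'crawl', 'example',
         '-a', f'symbol_url={symbol_url}', '-O', out],
        check=True,
    )
    with open(out) as f:
        return json.load(f)

The Flask side then only enqueues the job and returns at once; the client polls for the result instead of sleeping inside the request:

# app.py - enqueue the crawl, poll for the result.
from flask import Flask, jsonify, request
from tasks import run_spider

app = Flask(__name__)

@app.route('/', methods=['POST'])
def submit():
    symbol = request.form['symbol']
    url = f'https://example.ir/ReportList.aspx?search&Symbol={symbol}'
    task = run_spider.delay(url)   # returns immediately with a task id
    return jsonify({'task_id': task.id}), 202

@app.route('/result/<task_id>')
def result(task_id):
    task = run_spider.AsyncResult(task_id)
    if not task.ready():
        return jsonify({'status': 'pending'}), 202
    return jsonify(task.get())

No globals and no time.sleep: the URL travels inside the task arguments, and the crawl outlives the request that started it.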