I have a Scrapy project that works fine and returns the results I want. The spider code is:
import scrapy
from scrapy_splash import SplashRequest
from bs4 import BeautifulSoup as bs

from EScrapy.example.items import MyItem  # adjust to wherever MyItem lives in your project


class ExampleSpider(scrapy.Spider):
    name = 'example'
    custom_settings = {'FEED_URI': 'output/output.json', 'CLOSESPIDER_TIMEOUT': 50}
    myBaseUrl = ''
    start_urls = []

    def __init__(self, symbol_url='', **kwargs):
        super().__init__(**kwargs)
        self.myBaseUrl = symbol_url
        self.start_urls.append(self.myBaseUrl)

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 5})

    def parse(self, response):
        # walk the results table and follow each report link
        tbody = response.css('tbody.scrollContent')
        row = tbody.css('tr.table__row.ng-scope')
        col = row.css('td')
        links = col.css('a.letter-title.ng-binding.ng-scope::attr(href)')
        for link in links:
            yield response.follow("https://www.example.ir" + link.get(),
                                  callback=self.parse_page)

    def parse_page(self, response):
        page = bs(response.text, 'html.parser')
        # the name sits in parentheses inside the <head> text
        name = page.find('head', id='Head1').text.strip().split('(')[-1]
        name = name.replace(')', '').strip()
        try:
            tables = page.find_all('tr', class_="ComputationalRow")
            table_head = page.find('table', {'id': 'exampleID'})
            date = table_head.find_all('th')[-1].text.split(' ')[3]
            yield MyItem(name=name, date=date)
        except (AttributeError, IndexError):  # page without the expected table
            pass
And here is my Flask code to run the spider:
import crochet
crochet.setup()  # must run before the reactor is touched

import os
import time

from flask import Flask, render_template, jsonify, request, redirect, url_for
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.signalmanager import dispatcher

from EScrapy.example.spiders.example import ExampleSpider

app = Flask(__name__)
output_data = []
crawl_runner = CrawlerRunner()


@app.route('/')
def index():
    return render_template("index.html")


@app.route('/', methods=['POST'])
def submit():
    if request.method == 'POST':
        s = request.form['symbol']
        global baseURL
        baseURL = f'https://example.ir/ReportList.aspx?search&Symbol={s}'
        if os.path.exists("output/output.json"):
            os.remove("output/output.json")
        return redirect(url_for('scrape'))


@app.route("/scrape")
def scrape():
    scrape_with_crochet(baseURL=baseURL)
    time.sleep(20)  # wait for the crawl before returning whatever was collected
    return jsonify(output_data)


@crochet.run_in_reactor
@crochet.wait_for(timeout=10)
def scrape_with_crochet(baseURL):
    # append every scraped item to output_data via the item_scraped signal
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    eventual = crawl_runner.crawl(ExampleSpider, symbol_url=baseURL)
    return eventual


def _crawler_result(item, response, spider):
    output_data.append(dict(item))


if __name__ == "__main__":
    app.run(debug=True)
The spider works properly on its own when I put the link in start_urls and delete the __init__ method: it writes the JSON file as expected. But when I run it from the Flask application, Flask returns an empty list and I see this error in debug mode on the command line:
Unhandled error in EventualResult
Traceback (most recent call last):
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 662, in callback
self._startRunCallbacks(result)
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 764, in _startRunCallbacks
self._runCallbacks()
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 858, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 1751, in gotResult
current_context.run(_inlineCallbacks, r, gen, status)
--- <exception caught here> ---
File "[..]/venv/lib/python3.9/site-packages/twisted/internet/defer.py", line 1661, in _inlineCallbacks
result = current_context.run(gen.send, result)
builtins.StopIteration:
I need this project for my hiring task and am really stuck on this problem. Does anyone have an idea of how to solve it?
The problem comes from using baseURL as a global. Flask has request and application context concepts: a global set while handling one request is a fragile way to pass state to work that runs outside that request (with multiple workers, each process has its own globals). On top of that, the scraping can take longer than a request should be held open, so you shouldn't run it inside the request context at all. It's better to hand the crawl to a task queue such as Celery, typically with Redis as the broker. Just google "flask redis scrapy", or read the background-jobs chapter of the brilliant Flask Mega-Tutorial by Miguel Grinberg.
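A minimal sketch of that approach, assuming Celery with a local Redis broker and that the spider can be started from the command line with scrapy crawl example. The names here (tasks.py, run_spider, the redis:// URLs) are placeholders, not part of the original project, and the -O overwrite flag needs Scrapy 2.1+:

# tasks.py - Celery task that runs the spider in its own process.
# A subprocess sidesteps Twisted's "reactor is not restartable"
# problem inside a long-lived worker.
import json
import subprocess
import tempfile

from celery import Celery

celery_app = Celery('tasks',
                    broker='redis://localhost:6379/0',
                    backend='redis://localhost:6379/1')

@celery_app.task
def run_spider(symbol_url):
    with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
        out = f.name
    # -a passes the spider's constructor argument, -O (over)writes the feed
    subprocess.run(
        ['scrapy', 'crawl', 'example',
         '-a', f'symbol_url={symbol_url}', '-O', out],
        check=True,
    )
    with open(out) as f:
        return json.load(f)

The Flask side then only enqueues the job and returns at once; the client polls for the result instead of sleeping inside the request:

# app.py - enqueue the crawl, poll for the result.
from flask import Flask, jsonify, request
from tasks import run_spider

app = Flask(__name__)

@app.route('/', methods=['POST'])
def submit():
    symbol = request.form['symbol']
    url = f'https://example.ir/ReportList.aspx?search&Symbol={symbol}'
    task = run_spider.delay(url)   # returns immediately with a task id
    return jsonify({'task_id': task.id}), 202

@app.route('/result/<task_id>')
def result(task_id):
    task = run_spider.AsyncResult(task_id)
    if not task.ready():
        return jsonify({'status': 'pending'}), 202
    return jsonify(task.get())

No globals and no time.sleep: the URL travels inside the task arguments, and the crawl outlives the request that started it.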