Search code examples
pythonflaskscrapy

ValueError: signal only works in main thread (Python / Scrapy + Flask )


I am trying to build an API with Flask + Scrapy. The main goal is to run a Scrapy crawl whenever a user sends me a request with some information (such as country, location, and job title) as parameters. I first tried scrapyrt, but I could not get it to accept parameters (they cannot be passed in the metadata), which is why I am now trying Flask. However, it keeps raising this error:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2464, in __call__
    return self.wsgi_app(environ, start_response)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2450, in wsgi_app
    response = self.handle_exception(e)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 272, in error_router
    return original_handler(e)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1867, in handle_exception
    reraise(exc_type, exc_value, tb)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/_compat.py", line 38, in reraise
    raise value.with_traceback(tb)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 272, in error_router
    return original_handler(e)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/_compat.py", line 38, in reraise
    raise value.with_traceback(tb)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1950, in full_dispatch_request
    rv = self.dispatch_request()
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1936, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 468, in wrapper
    resp = resource(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask/views.py", line 89, in view
    return self.dispatch_request(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 583, in dispatch_request
    resp = meth(*args, **kwargs)
  File "/Users/gadgethub/Neobrain_git/Jobs_API/src/collector/api.py", line 30, in post
    df = runner.run_indeed()
  File "/Users/gadgethub/Neobrain_git/Jobs_API/src/collector/main.py", line 21, in run_indeed
    df = scraper.run(job_title=self.job_title,country=self.country,location=self.location)
  File "/Users/gadgethub/Neobrain_git/Jobs_API/src/collector/scraper/indeedjobs/indeed_scraper.py", line 122, in run
    process = CrawlerProcess(settings=settings)
  File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/crawler.py", line 268, in __init__
    install_shutdown_handlers(self._signal_shutdown)
  File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/utils/ossignal.py", line 22, in install_shutdown_handlers
    reactor._handleSignals()
  File "/opt/anaconda3/lib/python3.7/site-packages/twisted/internet/posixbase.py", line 295, in _handleSignals
    _SignalReactorMixin._handleSignals(self)
  File "/opt/anaconda3/lib/python3.7/site-packages/twisted/internet/base.py", line 1243, in _handleSignals
    signal.signal(signal.SIGINT, self.sigInt)
  File "/opt/anaconda3/lib/python3.7/signal.py", line 47, in signal
    handler = _signal.signal(_enum_to_int(signalnum), _enum_to_int(handler))
ValueError: signal only works in main thread 

This is my flask app:

from flask import Flask, request
from flask_restful import Api,Resource
import json
from main import Run


# Flask application object and the Flask-RESTful API wrapper bound to it.
app = Flask(__name__)
# NOTE(review): the secret key is hard-coded in source; presumably it should
# come from the environment or a config file in a real deployment -- confirm.
app.config.update(SECRET_KEY='neobrain2020')
api = Api(app)

""" This is the main file that makes the API run."""



#______________________________________________________________________________________________________________


class GetJobs(Resource):
    """Resource that runs the job scrapers for a search sent as JSON.

    The POST body must be a JSON object with the keys ``job_title``,
    ``country`` and ``location``. Those values are handed to ``Run``, the
    Indeed and neuvoo collectors are executed, and their combined results
    are returned as a JSON string.
    """

    def post(self):
        """Run both scrapers for the posted query and return the merged rows.

        Returns:
            The JSON serialisation (``DataFrame.to_json()``) of the Indeed
            results followed by the neuvoo results.

        Raises:
            KeyError: if ``job_title``, ``country`` or ``location`` is
                missing from the request body.
        """
        # Imported here so the block is self-contained; the module's import
        # section does not bring pandas into scope.
        import pandas as pd

        # force=True parses the body as JSON regardless of the Content-Type.
        json_data = request.get_json(force=True)
        job_title = json_data['job_title']
        country = json_data['country']
        location = json_data['location']

        # Instantiate the Run class
        runner = Run(job_title=job_title, country=country, location=location)

        # Same call order as before: Indeed first, then neuvoo.
        indeed_df = runner.run_indeed()
        neuvoo_df = runner.run_neuvoo()

        # DataFrame.append is deprecated and was removed in pandas 2.0;
        # pd.concat is the supported replacement.
        # ignore_index=True renumbers the rows: keeping both original indexes
        # yields duplicate labels when each frame has its own default index,
        # and to_json() with the default orient='columns' rejects those.
        # NOTE(review): assumes both collectors return pandas DataFrames whose
        # index carries no meaning -- confirm against the scrapers.
        df = pd.concat([indeed_df, neuvoo_df], ignore_index=True)

        return df.to_json()


# Register the API routes: GetJobs is served at /jobs.
api.add_resource(GetJobs, "/jobs")

# Start Flask's built-in server only when this file is executed directly.
if __name__ == '__main__':
    # NOTE(review): host='0.0.0.0' together with debug=True presumably exposes
    # the debug server beyond localhost; looks intended for local development
    # only -- confirm before deploying.
    app.run(host='0.0.0.0', debug=True)

and the Run() class:

class Run:
    """Run the job collectors for a single search query.

    An instance stores the job title, country and location of one search,
    plus the time it was created, and exposes one method per job source.
    """

    def __init__(self, job_title, country, location):
        """Store the search parameters and record the creation time."""
        self.now = datetime.now()
        self.job_title, self.country, self.location = job_title, country, location

    def run_indeed(self):
        """Run the Indeed scraper for the stored query and return its result."""
        print("Running Indeed Scrapper")

        return IndeedScraper().run(
            job_title=self.job_title,
            country=self.country,
            location=self.location,
        )

    def run_neuvoo(self):
        """Run the neuvoo collector for the stored query and return its result."""
        print("Running neuvoo Scrapper")

        # The collector receives one "<COUNTRY>,<location>" string, with the
        # country upper-cased.
        locations = self.country.upper() + "," + self.location

        # Arguments handed to the collector.
        collector_args = {
            'name': 'neuvoo',
            'jobs': self.job_title,
            'locations': locations,
            'output_file': '../../../data/raw/neuvoo/${TS}.csv.gz',
            #...
        }

        return ApiNeuvoo(collector_args).run()

Can you help me solve this problem?


Solution

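  • I managed to solve this problem by using CrawlerRunner instead of CrawlerProcess. As the traceback shows, CrawlerProcess installs shutdown signal handlers in its constructor, and Python only allows signal handlers to be set from the main thread; CrawlerRunner does not install them.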

    Here is the code:

    # CrawlerRunner, unlike CrawlerProcess, leaves reactor and signal handling
    # to the caller.
    runner = CrawlerRunner(settings=settings)

    @defer.inlineCallbacks
    def crawl():
        """Run the Indeed spider once, then stop the reactor."""
        try:
            yield runner.crawl(IndeedUsSpider, job_title=job_title, country=country, location=location)
        finally:
            # Stop the reactor even when the crawl fails; otherwise
            # reactor.run() below would never return.
            reactor.stop()

    crawl()

    # reactor.run() installs signal handlers by default, which raises the same
    # "signal only works in main thread" ValueError when called from a request
    # handler thread, so skip installing them.
    # NOTE(review): a Twisted reactor cannot be started again after it has
    # stopped, so this presumably works only once per process -- confirm
    # whether later requests need a subprocess or a long-lived reactor thread.
    reactor.run(installSignalHandlers=False)
    

    Reference: