I am trying to build an API with Flask + Scrapy. The goal is to start a Scrapy crawl whenever a user sends me a request with some parameters (such as country, location, and job title). I first tried scrapyrt, but I could not get it to accept these parameters (they cannot be passed as metadata), which is why I am now trying Flask. However, every request fails with this error:
Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2464, in __call__
return self.wsgi_app(environ, start_response)
File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2450, in wsgi_app
response = self.handle_exception(e)
File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 272, in error_router
return original_handler(e)
File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1867, in handle_exception
reraise(exc_type, exc_value, tb)
File "/opt/anaconda3/lib/python3.7/site-packages/flask/_compat.py", line 38, in reraise
raise value.with_traceback(tb)
File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 2447, in wsgi_app
response = self.full_dispatch_request()
File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1952, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 272, in error_router
return original_handler(e)
File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1821, in handle_user_exception
reraise(exc_type, exc_value, tb)
File "/opt/anaconda3/lib/python3.7/site-packages/flask/_compat.py", line 38, in reraise
raise value.with_traceback(tb)
File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1950, in full_dispatch_request
rv = self.dispatch_request()
File "/opt/anaconda3/lib/python3.7/site-packages/flask/app.py", line 1936, in dispatch_request
return self.view_functions[rule.endpoint](**req.view_args)
File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 468, in wrapper
resp = resource(*args, **kwargs)
File "/opt/anaconda3/lib/python3.7/site-packages/flask/views.py", line 89, in view
return self.dispatch_request(*args, **kwargs)
File "/opt/anaconda3/lib/python3.7/site-packages/flask_restful/__init__.py", line 583, in dispatch_request
resp = meth(*args, **kwargs)
File "/Users/gadgethub/Neobrain_git/Jobs_API/src/collector/api.py", line 30, in post
df = runner.run_indeed()
File "/Users/gadgethub/Neobrain_git/Jobs_API/src/collector/main.py", line 21, in run_indeed
df = scraper.run(job_title=self.job_title,country=self.country,location=self.location)
File "/Users/gadgethub/Neobrain_git/Jobs_API/src/collector/scraper/indeedjobs/indeed_scraper.py", line 122, in run
process = CrawlerProcess(settings=settings)
File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/crawler.py", line 268, in __init__
install_shutdown_handlers(self._signal_shutdown)
File "/opt/anaconda3/lib/python3.7/site-packages/scrapy/utils/ossignal.py", line 22, in install_shutdown_handlers
reactor._handleSignals()
File "/opt/anaconda3/lib/python3.7/site-packages/twisted/internet/posixbase.py", line 295, in _handleSignals
_SignalReactorMixin._handleSignals(self)
File "/opt/anaconda3/lib/python3.7/site-packages/twisted/internet/base.py", line 1243, in _handleSignals
signal.signal(signal.SIGINT, self.sigInt)
File "/opt/anaconda3/lib/python3.7/signal.py", line 47, in signal
handler = _signal.signal(_enum_to_int(signalnum), _enum_to_int(handler))
ValueError: signal only works in main thread
This is my Flask app:
import json
import os

import pandas as pd
from flask import Flask, request
from flask_restful import Api,Resource

from main import Run
# This is the main file that makes the API run: it creates the Flask
# application and the flask_restful Api wrapper the resources are registered on.
app = Flask(__name__)
# Read the secret from the environment so it does not have to live in source
# control; the original literal is kept only as a backward-compatible fallback
# for local runs and should be overridden in any real deployment.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'neobrain2020')
api = Api(app)
#______________________________________________________________________________________________________________
class GetJobs(Resource):
    """Resource that collects job postings for one search.

    The POST body must be a JSON object carrying ``job_title``, ``country``
    and ``location``. Those values are handed to ``Run``, both of its
    collectors are executed, and their results are returned together.
    """

    # Keys that must be present in the JSON request body.
    REQUIRED_FIELDS = ('job_title', 'country', 'location')

    def post(self):
        """Run both collectors for the requested search.

        Returns:
            The combined results serialised with ``to_json()``, or a
            ``(body, 400)`` pair describing the problem when the request
            body is not a JSON object or lacks a required key.
        """
        # force=True parses the body as JSON regardless of the Content-Type.
        json_data = request.get_json(force=True)
        if not isinstance(json_data, dict):
            return {'message': 'Request body must be a JSON object.'}, 400

        # Report missing keys as a client error instead of letting a KeyError
        # surface as an internal server error.
        missing = [key for key in self.REQUIRED_FIELDS if key not in json_data]
        if missing:
            return {'message': 'Missing required field(s): ' + ', '.join(missing)}, 400

        # Instantiate the Run class
        runner = Run(
            job_title=json_data['job_title'],
            country=json_data['country'],
            location=json_data['location'],
        )

        # Assumes both collectors return pandas DataFrames (the original code
        # called df.append / df.to_json on them) -- TODO confirm.
        # DataFrame.append was removed in pandas 2.0; pd.concat yields the same
        # row-wise combination and also works on older pandas releases.
        df = pd.concat([runner.run_indeed(), runner.run_neuvoo()])

        # NOTE(review): to_json() already returns a JSON string, and
        # flask_restful presumably JSON-encodes the return value again, so
        # clients would receive a quoted string -- confirm this is intended.
        return df.to_json()
# Register the API endpoints.
api.add_resource(GetJobs, "/jobs")

if __name__ == '__main__':
    # Started directly as a script: serve with Flask's built-in server,
    # listening on every network interface with debug mode switched on.
    # NOTE(review): debug mode on a publicly reachable host is meant for
    # development only -- confirm this is never used in production.
    app.run(debug=True, host='0.0.0.0')
And this is the Run() class:
class Run:
    """Hold the parameters of one job search and run it on each collector."""

    def __init__(self, job_title, country, location):
        """Store the search parameters and record the creation time."""
        self.job_title = job_title
        self.location = location
        self.now = datetime.now()
        self.country = country

    def run_indeed(self):
        """Run the Indeed scraper for this search and return what it yields."""
        print("Running Indeed Scrapper")
        # The job title is forwarded as stored on the instance.
        # NOTE(review): the original comment said the scraper expects a list
        # of jobs even for a single title -- verify against IndeedScraper.run.
        crawl = IndeedScraper().run
        # scraper.save(df) is left disabled; the result is only returned.
        return crawl(job_title=self.job_title, country=self.country, location=self.location)

    def run_neuvoo(self):
        """Run the neuvoo collector for this search and return what it yields."""
        print("Running neuvoo Scrapper")
        job_titles = self.job_title
        # Location string is "<COUNTRY IN UPPER CASE>,<location>".
        where = ",".join((self.country.upper(), self.location))
        # Arguments handed to the collector.
        options = dict(
            name='neuvoo',
            jobs=job_titles,
            locations=where,
            output_file='../../../data/raw/neuvoo/${TS}.csv.gz',
            # ... (further entries were elided in the original listing)
        )
        return ApiNeuvoo(options).run()
Can you help me solve this problem?
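I managed to solve this problem by using `CrawlerRunner` instead of `CrawlerProcess`. As the traceback shows, `CrawlerProcess` installs shutdown signal handlers in its constructor, and Python only allows that in the main thread; `CrawlerRunner` leaves starting the reactor and handling signals to the caller.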
Here is the code:
# CrawlerRunner does not install shutdown signal handlers when constructed
# (CrawlerProcess does), so creating it here does not raise
# "signal only works in main thread".
runner = CrawlerRunner(settings=settings)


@defer.inlineCallbacks
def crawl():
    """Crawl once with IndeedUsSpider, then stop the reactor."""
    try:
        yield runner.crawl(IndeedUsSpider, job_title=job_title, country=country, location=location)
    finally:
        # Stop the reactor even when the crawl fails; otherwise the
        # reactor.run() call below would block forever.
        reactor.stop()


crawl()
# Blocks until crawl() calls reactor.stop().
# NOTE(review): a Twisted reactor cannot be restarted after it has stopped, so
# this pattern presumably works only once per process. reactor.run() also
# installs signal handlers by default, which fails outside the main thread --
# consider reactor.run(installSignalHandlers=False) or running each crawl in a
# separate process. Confirm against how this code is invoked.
reactor.run()
Reference: