I am writing a Scrapy project which works perfectly. I have converted it to an executable using PyInstaller. I was expecting some trouble with importing modules, since I have read that a lot of people run into that — but I don't even get that far. As soon as I run the main.exe file, the console opens and shows the following message:
Traceback (most recent call last): File "rascraper\main.py", line 1,
This is the corresponding main.py file
from rascraper.spiders.spiderone import PostsSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    """Run PostsSpider in-process using the Scrapy project's settings."""
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(PostsSpider)
    # Starts the Twisted reactor; blocks until the crawl finishes.
    crawler.start()


if __name__ == '__main__':
    main()
And This is my spider class
import scrapy
class PostsSpider(scrapy.Spider):
    """Spider that scrapes past-event listings for an artist from ra.co."""

    name = 'posts'

    # Interactive prompts kept commented out; note that, uncommented, they
    # would run at class-definition time (i.e. on import).
    # artist = input(f'Artist Name:')
    # filter = input(f'filter on Country? (y/n):')
    #
    # if filter == 'y':
    #     country = input(f'Country:')
    #     start_urls = [
    #         f'https://ra.co/dj/{artist}/past-events?country={country}'
    #     ]
    #
    # elif filter == 'n':
    #     start_urls = [
    #         f'https://ra.co/dj/{artist}/past-events'
    #     ]

    # Request headers mimicking a regular browser session against ra.co.
    HEADERS = {
        'accept': '/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'nl-NL,nl;q=0.9,en-US;q=0.8,en;q=0.7,fr;q=0.6',
        'authorization': 'df67dacc9c704696b908a618dd4f59be',
        'cache-control': 'max-age=0',
        'content-type': 'application/json',
        'origin': 'https://ra.co',
        'referer': 'https://ra.co/',
        'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
    }

    def parse(self, response):
        """Yield one item per event row found in the listing page."""
        # Field name -> CSS selector for the text node inside each row.
        field_selectors = {
            'Date': '.Text-sc-1t0gn2o-0.jmZufm::text',
            'Event': '.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.dXQVFW::text',
            'Location': '.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.echVma::text',
            'Venue': '.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.dxNiKF::text',
            'Acts': '.Text-sc-1t0gn2o-0.bYvpkM::text',
        }
        for post in response.css('li.Column-sc-18hsrnn-0.inVJeD'):
            yield {field: post.css(sel).get()
                   for field, sel in field_selectors.items()}
Where does this error come from and how can I solve it?
Making a standalone executable from scrapy project with PyInstaller
In order to create a single executable file you'll need to do the following steps:
import scrapy.utils.misc
import scrapy.core.scraper
# No-op replacement for Scrapy's warn_on_generator_with_return_value.
# NOTE(review): presumably needed because the real check inspects the
# spider's source code, which is not available inside a frozen
# (PyInstaller) executable — confirm against the Scrapy version in use.
def warn_on_generator_with_return_value_stub(spider, callable):
    pass
# Patch the name in both modules that reference it, so neither code path
# invokes the original implementation at runtime.
scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
So in my example spider.py
will look like this:
import scrapy
import scrapy.utils.misc
import scrapy.core.scraper
# No-op stub for Scrapy's generator-return-value warning (see the
# explanation above the first occurrence of this snippet).
def warn_on_generator_with_return_value_stub(spider, callable):
    pass
# Install the stub in both modules that import the original function.
scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
class ExampleSpider(scrapy.Spider):
    """Minimal demo spider: extracts title and price from one detail page."""

    name = 'example_spider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        """Yield a single item with the product's title and price."""
        yield {
            'title': response.xpath('//h3/text()').get(),
            'price': response.xpath('//div[@class="card-body"]/h4/text()').get(),
        }
main.py
Add the following to main.py (if you don't, you'll get an error whenever you try to run the executable from a directory outside of your project's directory):
import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', PATH_TO_SETTINGS)
In this example main.py
:
import os
from rascraper.spiders.spider import ExampleSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    """Point Scrapy at the bundled settings module, then run the spider."""
    # Without this the frozen executable fails when launched from a
    # directory outside the project's directory.
    os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(ExampleSpider)
    crawler.start()


if __name__ == '__main__':
    main()
Run PyInstaller to generate a spec file: python -m PyInstaller --onefile --name example_exe main.py
Change the spec file: Add all the files in your project to datas
list.
Before:
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec as generated by:
#   python -m PyInstaller --onefile --name example_exe main.py
# ("Before" state — no project data files bundled yet.)
block_cipher = None

a = Analysis(['main.py'],
             pathex=[],
             binaries=[],
             datas=[],                 # empty: project files are NOT bundled
             hiddenimports=[],
             hookspath=[],
             hooksconfig={},
             runtime_hooks=[],
             excludes=[],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher,
             noarchive=False)
pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          [],
          name='example_exe',
          debug=False,
          bootloader_ignore_signals=False,
          strip=False,
          upx=True,
          upx_exclude=[],
          runtime_tmpdir=None,
          console=True,               # keep a console window for log output
          disable_windowed_traceback=False,
          target_arch=None,
          codesign_identity=None,
          entitlements_file=None )
After:
# -*- mode: python ; coding: utf-8 -*-
# "After" state: the project's source/config files are added to `datas`
# so they are unpacked next to the executable at runtime.
block_cipher = None

a = Analysis(['main.py'],
             pathex=[],
             binaries=[],
             # Each entry is (source path on disk, destination dir inside
             # the bundle); '.' places the file at the bundle root.
             datas=[('items.py','.'),
                    ('middlewares.py','.'),
                    ('pipelines.py','.'),
                    ('settings.py','.'),
                    ('spiders','spiders'),        # whole spiders package
                    ('..\\scrapy.cfg', '.')],     # cfg lives one level up
             hiddenimports=[],
             hookspath=[],
             hooksconfig={},
             runtime_hooks=[],
             excludes=[],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher,
             noarchive=False)
pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          [],
          name='example_exe',
          debug=False,
          bootloader_ignore_signals=False,
          strip=False,
          upx=True,
          upx_exclude=[],
          runtime_tmpdir=None,
          console=True,               # keep a console window for log output
          disable_windowed_traceback=False,
          target_arch=None,
          codesign_identity=None,
          entitlements_file=None )
python -m PyInstaller example_exe.spec
Result:
Now you should have a standalone executable that you can run from any directory:
C:\Users\MY_USER\Desktop>example_exe.exe
...
...
[scrapy.core.engine] INFO: Spider opened
[scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
[scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
[scrapy.core.engine] DEBUG: Crawled (404) <GET https://scrapingclub.com/robots.txt> (referer: None)
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://scrapingclub.com/exercise/detail_basic/> (referer: None)
[scrapy.core.scraper] DEBUG: Scraped from <200 https://scrapingclub.com/exercise/detail_basic/>
{'title': 'Long-sleeved Jersey Top', 'price': '$12.99'}
[scrapy.core.engine] INFO: Closing spider (finished)
[scrapy.statscollectors] INFO: Dumping Scrapy stats:
...
...
Specifically for OP's project:
The project tree looks like this:
C:.
│ main.py
│ scrapy.cfg
│
└───rascraper
│ items.py
│ middlewares.py
│ pipelines.py
│ settings.py
│ __init__.py
│
├───spiders
│ │ spiderone.py
│ │ __init__.py
│ │
│ └───__pycache__
│ spiderone.cpython-310.pyc
│ __init__.cpython-310.pyc
│
└───__pycache__
middlewares.cpython-310.pyc
pipelines.cpython-310.pyc
settings.cpython-310.pyc
__init__.cpython-310.pyc
So the datas
list should be:
# Paths are relative to the spec file at the project root; each tuple is
# (source path on disk, destination directory inside the bundle).
datas=[('rascraper\\items.py', '.'),
       ('rascraper\\middlewares.py', '.'),
       ('rascraper\\pipelines.py', '.'),
       ('rascraper\\settings.py', '.'),
       ('rascraper\\spiders', 'spiders'),
       ('scrapy.cfg', '.')],
Correction: In main.py
it should just be os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
.