python · scrapy · pyinstaller

Error after running .exe file originating from scrapy project


I am writing a scrapy project which works perfectly. I have converted it to an executable using pyinstaller. I was expecting some trouble importing modules, since I have read that a lot of people run into that, but for some reason I don't even get that far. As soon as I run the main.exe file, the console opens and shows the following message:

Traceback (most recent call last):
  File "rascraper\main.py", line 1,

This is the corresponding main.py file

from rascraper.spiders.spiderone import PostsSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():

    process = CrawlerProcess(get_project_settings())
    process.crawl(PostsSpider)
    process.start()



if __name__ == '__main__':
    main()

And this is my spider class:

import scrapy


class PostsSpider(scrapy.Spider):
    name = 'posts'

    # artist = input(f'Artist Name:')
    # filter = input(f'filter on Country? (y/n):')
    #
    # if filter == 'y':
    #     country = input(f'Country:')
    #     start_urls = [
    #         f'https://ra.co/dj/{artist}/past-events?country={country}'
    #     ]
    #
    # elif filter == 'n':
    #     start_urls = [
    #         f'https://ra.co/dj/{artist}/past-events'
    #     ]

    HEADERS = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'nl-NL,nl;q=0.9,en-US;q=0.8,en;q=0.7,fr;q=0.6',
        'authorization': 'df67dacc9c704696b908a618dd4f59be',
        'cache-control': 'max-age=0',
        'content-type': 'application/json',
        'origin': 'https://ra.co',
        'referer': 'https://ra.co/',
        'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
    }


    def parse(self, response):

        for post in response.css('li.Column-sc-18hsrnn-0.inVJeD'):

            date = post.css('.Text-sc-1t0gn2o-0.jmZufm::text').get()
            event = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.dXQVFW::text').get()
            location = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.echVma::text').get()
            venue = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.dxNiKF::text').get()
            acts = post.css('.Text-sc-1t0gn2o-0.bYvpkM::text').get()

            item = {}
            item['Date'] = date
            item['Event'] = event
            item['Location'] = location
            item['Venue'] = venue
            item['Acts'] = acts

            yield item

Where does this error come from and how can I solve it?


Solution

  • Making a standalone executable from scrapy project with PyInstaller

    In order to create a single executable file, you'll need to follow these steps:

    1. Add this stub to all of your spiders (source). Scrapy's warn_on_generator_with_return_value check reads the spider's source code via inspect, which a frozen executable doesn't ship, so the check is replaced with a no-op:
    import scrapy.utils.misc
    import scrapy.core.scraper
    
    
    def warn_on_generator_with_return_value_stub(spider, callable):
        pass
    
    
    scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    

    So in this example, spider.py will look like this:

    import scrapy
    import scrapy.utils.misc
    import scrapy.core.scraper
    
    
    def warn_on_generator_with_return_value_stub(spider, callable):
        pass
    
    
    scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    
    
    class ExampleSpider(scrapy.Spider):
        name = 'example_spider'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        def parse(self, response):
            item = dict()
            item['title'] = response.xpath('//h3/text()').get()
            item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
            yield item
    
    2. Add this to main.py (if you don't add this, you'll get an error whenever you run the executable from a directory outside of your project's directory):
    import os
    
    os.environ.setdefault('SCRAPY_SETTINGS_MODULE', PATH_TO_SETTINGS)
    

    In this example, main.py looks like this:

    import os
    from rascraper.spiders.spider import ExampleSpider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    
    
    def main():
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
        process = CrawlerProcess(get_project_settings())
        process.crawl(ExampleSpider)
        process.start()
    
    
    if __name__ == '__main__':
        main()
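
    Why the bare module name 'settings' works once frozen: PyInstaller's one-file bootloader unpacks everything listed in datas into a temporary directory (sys._MEIPASS) and puts that directory on sys.path, so the bundled settings.py is importable as a top-level module. Below is a minimal sketch making the two cases explicit; the unfrozen rascraper.settings fallback is an assumption based on this project's layout:

    import os
    import sys


    def configure_settings():
        if getattr(sys, 'frozen', False):
            # Frozen by PyInstaller: datas were unpacked to sys._MEIPASS,
            # which is on sys.path, so the copied settings.py is importable
            # as the top-level module 'settings'.
            os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
        else:
            # Running from source: use the project's normal package path.
            os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'rascraper.settings')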
    
    3. Run pyinstaller to generate a spec file: python -m PyInstaller --onefile --name example_exe main.py.
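
    Instead of editing the spec afterwards (step 4), the same files can be attached while generating it with PyInstaller's --add-data option, which pre-fills the spec's datas list; the separator between source and destination is ; on Windows and : elsewhere. A sketch, assuming settings.py and spiders/ sit next to main.py (adjust the paths to your layout):

    python -m PyInstaller --onefile --name example_exe --add-data "settings.py;." --add-data "spiders;spiders" main.py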

    4. Change the spec file: add all of your project's files to the datas list.

    Before:

    # -*- mode: python ; coding: utf-8 -*-
    
    
    block_cipher = None
    
    
    a = Analysis(['main.py'],
                 pathex=[],
                 binaries=[],
                 datas=[],
                 hiddenimports=[],
                 hookspath=[],
                 hooksconfig={},
                 runtime_hooks=[],
                 excludes=[],
                 win_no_prefer_redirects=False,
                 win_private_assemblies=False,
                 cipher=block_cipher,
                 noarchive=False)
    pyz = PYZ(a.pure, a.zipped_data,
                 cipher=block_cipher)
    
    exe = EXE(pyz,
              a.scripts,
              a.binaries,
              a.zipfiles,
              a.datas,  
              [],
              name='example_exe',
              debug=False,
              bootloader_ignore_signals=False,
              strip=False,
              upx=True,
              upx_exclude=[],
              runtime_tmpdir=None,
              console=True,
              disable_windowed_traceback=False,
              target_arch=None,
              codesign_identity=None,
              entitlements_file=None )
    

    After:

    # -*- mode: python ; coding: utf-8 -*-
    
    
    block_cipher = None
    
    
    a = Analysis(['main.py'],
                 pathex=[],
                 binaries=[],
                 datas=[('items.py','.'),
                        ('middlewares.py','.'),
                        ('pipelines.py','.'),
                        ('settings.py','.'),
                        ('spiders','spiders'),
                        ('..\\scrapy.cfg', '.')],
                 hiddenimports=[],
                 hookspath=[],
                 hooksconfig={},
                 runtime_hooks=[],
                 excludes=[],
                 win_no_prefer_redirects=False,
                 win_private_assemblies=False,
                 cipher=block_cipher,
                 noarchive=False)
    pyz = PYZ(a.pure, a.zipped_data,
                 cipher=block_cipher)
    
    exe = EXE(pyz,
              a.scripts,
              a.binaries,
              a.zipfiles,
              a.datas,  
              [],
              name='example_exe',
              debug=False,
              bootloader_ignore_signals=False,
              strip=False,
              upx=True,
              upx_exclude=[],
              runtime_tmpdir=None,
              console=True,
              disable_windowed_traceback=False,
              target_arch=None,
              codesign_identity=None,
              entitlements_file=None )
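
    Note that each datas entry is a (source, destination) pair: the source path is resolved relative to the spec file at build time, and the destination is a directory inside the unpacked bundle, with '.' meaning the bundle root. A small annotated sketch of the format:

    datas=[('settings.py', '.'),      # single file, unpacked to the bundle root
           ('spiders', 'spiders')],   # whole directory, copied under spiders/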
    
    
    5. Build the spec file: python -m PyInstaller example_exe.spec. The finished executable ends up in the dist folder.

    Result:

    You should now have a standalone executable that you can run from any directory:

    C:\Users\MY_USER\Desktop>example_exe.exe
    
    ...
    ...
    [scrapy.core.engine] INFO: Spider opened
    [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
    [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
    [scrapy.core.engine] DEBUG: Crawled (404) <GET https://scrapingclub.com/robots.txt> (referer: None)
    [scrapy.core.engine] DEBUG: Crawled (200) <GET https://scrapingclub.com/exercise/detail_basic/> (referer: None)
    [scrapy.core.scraper] DEBUG: Scraped from <200 https://scrapingclub.com/exercise/detail_basic/>
    {'title': 'Long-sleeved Jersey Top', 'price': '$12.99'}
    [scrapy.core.engine] INFO: Closing spider (finished)
    [scrapy.statscollectors] INFO: Dumping Scrapy stats:
    ...
    ...
    
    

    Specifically for OP's project:

    The project tree looks like this:

    C:.
    │   main.py
    │   scrapy.cfg
    │
    └───rascraper
        │   items.py
        │   middlewares.py
        │   pipelines.py
        │   settings.py
        │   __init__.py
        │
        ├───spiders
        │   │   spiderone.py
        │   │   __init__.py
        │   │
        │   └───__pycache__
        │           spiderone.cpython-310.pyc
        │           __init__.cpython-310.pyc
        │
        └───__pycache__
                middlewares.cpython-310.pyc
                pipelines.cpython-310.pyc
                settings.cpython-310.pyc
                __init__.cpython-310.pyc
    

    So the datas list should be:

    datas=[('rascraper\\items.py', '.'),
           ('rascraper\\middlewares.py', '.'),
           ('rascraper\\pipelines.py', '.'),
           ('rascraper\\settings.py', '.'),
           ('rascraper\\spiders', 'spiders'),
           ('scrapy.cfg', '.')],
    

    Correction: since settings.py is bundled at the root ('.'), in main.py it should just be os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings').
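
    Putting it all together for this project, main.py would look something like the sketch below (it assumes the warn_on_generator_with_return_value stub from step 1 has already been added to spiderone.py):

    import os
    from rascraper.spiders.spiderone import PostsSpider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings


    def main():
        # 'settings', not 'rascraper.settings': the bundled settings.py is
        # unpacked to the bundle root, which is on sys.path at runtime.
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
        process = CrawlerProcess(get_project_settings())
        process.crawl(PostsSpider)
        process.start()


    if __name__ == '__main__':
        main()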