python · scrapy · pyinstaller

Error after running .exe file originating from scrapy project


I am writing a scrapy project which works perfectly. I have converted it to an executable using pyinstaller. I was expecting some trouble importing modules, since I have read that a lot of people run into that, but for some reason I don't even get that far. As soon as I run the main.exe file, the console opens and shows the following message:

Traceback (most recent call last):
  File "rascraper\main.py", line 1,

This is the corresponding main.py file

from rascraper.spiders.spiderone import PostsSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():

    process = CrawlerProcess(get_project_settings())
    process.crawl(PostsSpider)
    process.start()



if __name__ == '__main__':
    main()

And this is my spider class:

import scrapy


class PostsSpider(scrapy.Spider):
    name = 'posts'

    # artist = input(f'Artist Name:')
    # filter = input(f'filter on Country? (y/n):')
    #
    # if filter == 'y':
    #     country = input(f'Country:')
    #     start_urls = [
    #         f'https://ra.co/dj/{artist}/past-events?country={country}'
    #     ]
    #
    # elif filter == 'n':
    #     start_urls = [
    #         f'https://ra.co/dj/{artist}/past-events'
    #     ]

    HEADERS = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'nl-NL,nl;q=0.9,en-US;q=0.8,en;q=0.7,fr;q=0.6',
        'authorization': 'df67dacc9c704696b908a618dd4f59be',
        'cache-control': 'max-age=0',
        'content-type': 'application/json',
        'origin': 'https://ra.co',
        'referer': 'https://ra.co/',
        'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Windows',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
    }


    def parse(self, response):

        for post in response.css('li.Column-sc-18hsrnn-0.inVJeD'):

            date = post.css('.Text-sc-1t0gn2o-0.jmZufm::text').get()
            event = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.dXQVFW::text').get()
            location = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.echVma::text').get()
            venue = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.dxNiKF::text').get()
            acts = post.css('.Text-sc-1t0gn2o-0.bYvpkM::text').get()

            item = {}
            item['Date'] = date
            item['Event'] = event
            item['Location'] = location
            item['Venue'] = venue
            item['Acts'] = acts

            yield item

Where does this error come from and how can I solve it?


Solution

  • Making a standalone executable from scrapy project with PyInstaller

    In order to create a single executable file, you'll need to follow these steps:

    1. Add this stub to all of your spiders (source). Scrapy's warn_on_generator_with_return_value check reads the spider's source code via inspect, which a frozen executable doesn't ship, so the check is replaced with a no-op:
    import scrapy.utils.misc
    import scrapy.core.scraper
    
    
    def warn_on_generator_with_return_value_stub(spider, callable):
        pass
    
    
    scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    

    So in this example, spider.py will look like this:

    import scrapy
    import scrapy.utils.misc
    import scrapy.core.scraper
    
    
    def warn_on_generator_with_return_value_stub(spider, callable):
        pass
    
    
    scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
    
    
    class ExampleSpider(scrapy.Spider):
        name = 'example_spider'
        allowed_domains = ['scrapingclub.com']
        start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    
        def parse(self, response):
            item = dict()
            item['title'] = response.xpath('//h3/text()').get()
            item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
            yield item
    
    2. Add this to main.py (if you don't add this, you'll get an error whenever you run the executable from a directory outside of your project's directory):
    import os
    
    os.environ.setdefault('SCRAPY_SETTINGS_MODULE', PATH_TO_SETTINGS)
    

    In this example, main.py looks like this:

    import os
    from rascraper.spiders.spider import ExampleSpider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    
    
    def main():
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
        process = CrawlerProcess(get_project_settings())
        process.crawl(ExampleSpider)
        process.start()
    
    
    if __name__ == '__main__':
        main()
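
    Why the bare module name 'settings' works once frozen: PyInstaller's one-file bootloader unpacks everything listed in datas into a temporary directory (sys._MEIPASS) and puts that directory on sys.path, so the bundled settings.py is importable as a top-level module. Below is a minimal sketch making the two cases explicit; the unfrozen rascraper.settings fallback is an assumption based on this project's layout:

    import os
    import sys


    def configure_settings():
        if getattr(sys, 'frozen', False):
            # Frozen by PyInstaller: datas were unpacked to sys._MEIPASS,
            # which is on sys.path, so the copied settings.py is importable
            # as the top-level module 'settings'.
            os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
        else:
            # Running from source: use the project's normal package path.
            os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'rascraper.settings')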
    
    3. Run pyinstaller to generate a spec file: python -m PyInstaller --onefile --name example_exe main.py.
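
    Instead of editing the spec afterwards (step 4), the same files can be attached while generating it with PyInstaller's --add-data option, which pre-fills the spec's datas list; the separator between source and destination is ; on Windows and : elsewhere. A sketch, assuming settings.py and spiders/ sit next to main.py (adjust the paths to your layout):

    python -m PyInstaller --onefile --name example_exe --add-data "settings.py;." --add-data "spiders;spiders" main.py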

    4. Change the spec file: add all of your project's files to the datas list.

    Before:

    # -*- mode: python ; coding: utf-8 -*-
    
    
    block_cipher = None
    
    
    a = Analysis(['main.py'],
                 pathex=[],
                 binaries=[],
                 datas=[],
                 hiddenimports=[],
                 hookspath=[],
                 hooksconfig={},
                 runtime_hooks=[],
                 excludes=[],
                 win_no_prefer_redirects=False,
                 win_private_assemblies=False,
                 cipher=block_cipher,
                 noarchive=False)
    pyz = PYZ(a.pure, a.zipped_data,
                 cipher=block_cipher)
    
    exe = EXE(pyz,
              a.scripts,
              a.binaries,
              a.zipfiles,
              a.datas,  
              [],
              name='example_exe',
              debug=False,
              bootloader_ignore_signals=False,
              strip=False,
              upx=True,
              upx_exclude=[],
              runtime_tmpdir=None,
              console=True,
              disable_windowed_traceback=False,
              target_arch=None,
              codesign_identity=None,
              entitlements_file=None )
    

    After:

    # -*- mode: python ; coding: utf-8 -*-
    
    
    block_cipher = None
    
    
    a = Analysis(['main.py'],
                 pathex=[],
                 binaries=[],
                 datas=[('items.py','.'),
                        ('middlewares.py','.'),
                        ('pipelines.py','.'),
                        ('settings.py','.'),
                        ('spiders','spiders'),
                        ('..\\scrapy.cfg', '.')],
                 hiddenimports=[],
                 hookspath=[],
                 hooksconfig={},
                 runtime_hooks=[],
                 excludes=[],
                 win_no_prefer_redirects=False,
                 win_private_assemblies=False,
                 cipher=block_cipher,
                 noarchive=False)
    pyz = PYZ(a.pure, a.zipped_data,
                 cipher=block_cipher)
    
    exe = EXE(pyz,
              a.scripts,
              a.binaries,
              a.zipfiles,
              a.datas,  
              [],
              name='example_exe',
              debug=False,
              bootloader_ignore_signals=False,
              strip=False,
              upx=True,
              upx_exclude=[],
              runtime_tmpdir=None,
              console=True,
              disable_windowed_traceback=False,
              target_arch=None,
              codesign_identity=None,
              entitlements_file=None )
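
    Note that each datas entry is a (source, destination) pair: the source path is resolved relative to the spec file at build time, and the destination is a directory inside the unpacked bundle, with '.' meaning the bundle root. A small annotated sketch of the format:

    datas=[('settings.py', '.'),      # single file, unpacked to the bundle root
           ('spiders', 'spiders')],   # whole directory, copied under spiders/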
    
    
    5. Build the spec file: python -m PyInstaller example_exe.spec. The finished executable ends up in the dist folder.

    Result:

    You should now have a standalone executable that you can run from any directory:

    C:\Users\MY_USER\Desktop>example_exe.exe
    
    ...
    ...
    [scrapy.core.engine] INFO: Spider opened
    [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
    [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
    [scrapy.core.engine] DEBUG: Crawled (404) <GET https://scrapingclub.com/robots.txt> (referer: None)
    [scrapy.core.engine] DEBUG: Crawled (200) <GET https://scrapingclub.com/exercise/detail_basic/> (referer: None)
    [scrapy.core.scraper] DEBUG: Scraped from <200 https://scrapingclub.com/exercise/detail_basic/>
    {'title': 'Long-sleeved Jersey Top', 'price': '$12.99'}
    [scrapy.core.engine] INFO: Closing spider (finished)
    [scrapy.statscollectors] INFO: Dumping Scrapy stats:
    ...
    ...
    
    

    Specifically for OP's project:

    The project tree looks like this:

    C:.
    │   main.py
    │   scrapy.cfg
    │
    └───rascraper
        │   items.py
        │   middlewares.py
        │   pipelines.py
        │   settings.py
        │   __init__.py
        │
        ├───spiders
        │   │   spiderone.py
        │   │   __init__.py
        │   │
        │   └───__pycache__
        │           spiderone.cpython-310.pyc
        │           __init__.cpython-310.pyc
        │
        └───__pycache__
                middlewares.cpython-310.pyc
                pipelines.cpython-310.pyc
                settings.cpython-310.pyc
                __init__.cpython-310.pyc
    

    So the datas list should be:

    datas=[('rascraper\\items.py', '.'),
           ('rascraper\\middlewares.py', '.'),
           ('rascraper\\pipelines.py', '.'),
           ('rascraper\\settings.py', '.'),
           ('rascraper\\spiders', 'spiders'),
           ('scrapy.cfg', '.')],
    

    Correction: since settings.py is bundled at the root ('.'), in main.py it should just be os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings').
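
    Putting it all together for this project, main.py would look something like the sketch below (it assumes the warn_on_generator_with_return_value stub from step 1 has already been added to spiderone.py):

    import os
    from rascraper.spiders.spiderone import PostsSpider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings


    def main():
        # 'settings', not 'rascraper.settings': the bundled settings.py is
        # unpacked to the bundle root, which is on sys.path at runtime.
        os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
        process = CrawlerProcess(get_project_settings())
        process.crawl(PostsSpider)
        process.start()


    if __name__ == '__main__':
        main()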