I accidentally changed something in my scrapy
settings (I was trying to debug
a spider by creating a runner.py
file), and I can't do anything now with scrapy. This is the error I am getting after running any scrapy
related command in the command prompt:
(scrape_virtual_workspace) C:\Users\Sophocles PC>cd projects
(scrape_virtual_workspace) C:\Users\Sophocles PC\projects>scrapy
Traceback (most recent call last):
File "C:\Anaconda\envs\scrape_virtual_workspace\Scripts\scrapy-script.py", line 10, in <module>
sys.exit(execute())
File "C:\Anaconda\envs\scrape_virtual_workspace\lib\site-packages\scrapy\cmdline.py", line 117, in execute
check_deprecated_settings(settings)
File "C:\Anaconda\envs\scrape_virtual_workspace\lib\site-packages\scrapy\settings\deprecated.py", line 22, in check_deprecated_settings
deprecated = [x for x in DEPRECATED_SETTINGS if settings.get(x[0], None) is not None]
File "C:\Anaconda\envs\scrape_virtual_workspace\lib\site-packages\scrapy\settings\deprecated.py", line 22, in <listcomp>
deprecated = [x for x in DEPRECATED_SETTINGS if settings.get(x[0], None) is not None]
AttributeError: 'NoneType' object has no attribute 'get'
(scrape_virtual_workspace) C:\Users\Sophocles PC\projects>
I tried fiddling with settings to correct this. But I can't seem to figure it out.
Can anyone help me with this?
below you can see deprecated.py:
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
DEPRECATED_SETTINGS = [
('TRACK_REFS', 'no longer needed (trackref is always enabled)'),
('RESPONSE_CLASSES', 'no longer supported'),
('DEFAULT_RESPONSE_ENCODING', 'no longer supported'),
('BOT_VERSION', 'no longer used (user agent defaults to Scrapy now)'),
('ENCODING_ALIASES', 'no longer needed (encoding discovery uses w3lib now)'),
('STATS_ENABLED', 'no longer supported (change STATS_CLASS instead)'),
('SQLITE_DB', 'no longer supported'),
('SELECTORS_BACKEND', 'use SCRAPY_SELECTORS_BACKEND environment variable instead'),
('AUTOTHROTTLE_MIN_DOWNLOAD_DELAY', 'use DOWNLOAD_DELAY instead'),
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
('REDIRECT_MAX_METAREFRESH_DELAY', 'use METAREFRESH_MAXDELAY instead'),
('LOG_UNSERIALIZABLE_REQUESTS', 'use SCHEDULER_DEBUG instead'),
]
def check_deprecated_settings(settings):
deprecated = [x for x in DEPRECATED_SETTINGS if settings[x[0]] is not None]
if deprecated:
msg = "You are using the following settings which are deprecated or obsolete"
msg += " (ask [email protected] for alternatives):"
msg = msg + "\n " + "\n ".join("%s: %s" % x for x in deprecated)
warnings.warn(msg, ScrapyDeprecationWarning)
below you can see settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for zalando project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'zalando'
SPIDER_MODULES = ['zalando.spiders']
NEWSPIDER_MODULE = 'zalando.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'zalando.middlewares.ZalandoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'zalando.middlewares.NetAPorterMaleDownloaderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'za
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)lando.pipelines.ZalandoPipeline': 300,
#}
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
** scrapy-script.py"" file:
# -*- coding: utf-8 -*-
import re
import sys
from scrapy.cmdline import execute
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
sys.exit(execute())
cmdline.py
from __future__ import print_function
import sys, os
import optparse
import cProfile
import inspect
import pkg_resources
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.misc import walk_modules
from scrapy.utils.project import inside_project, get_project_settings
from scrapy.utils.python import garbage_collect
from scrapy.settings.deprecated import check_deprecated_settings
def _iter_command_classes(module_name):
# TODO: add `name` attribute to commands and and merge this function with
# scrapy.utils.spider.iter_spider_classes
for module in walk_modules(module_name):
for obj in vars(module).values():
if inspect.isclass(obj) and \
issubclass(obj, ScrapyCommand) and \
obj.__module__ == module.__name__ and \
not obj == ScrapyCommand:
yield obj
def _get_commands_from_module(module, inproject):
d = {}
for cmd in _iter_command_classes(module):
if inproject or not cmd.requires_project:
cmdname = cmd.__module__.split('.')[-1]
d[cmdname] = cmd()
return d
def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
cmds = {}
for entry_point in pkg_resources.iter_entry_points(group):
obj = entry_point.load()
if inspect.isclass(obj):
cmds[entry_point.name] = obj()
else:
raise Exception("Invalid entry point %s" % entry_point.name)
return cmds
def _get_commands_dict(settings, inproject):
cmds = _get_commands_from_module('scrapy.commands', inproject)
cmds.update(_get_commands_from_entry_points(inproject))
cmds_module = settings['COMMANDS_MODULE']
if cmds_module:
cmds.update(_get_commands_from_module(cmds_module, inproject))
return cmds
def _pop_command_name(argv):
i = 0
for arg in argv[1:]:
if not arg.startswith('-'):
del argv[i]
return arg
i += 1
def _print_header(settings, inproject):
if inproject:
print("Scrapy %s - project: %s\n" % (scrapy.__version__, \
settings['BOT_NAME']))
else:
print("Scrapy %s - no active project\n" % scrapy.__version__)
def _print_commands(settings, inproject):
_print_header(settings, inproject)
print("Usage:")
print(" scrapy <command> [options] [args]\n")
print("Available commands:")
cmds = _get_commands_dict(settings, inproject)
for cmdname, cmdclass in sorted(cmds.items()):
print(" %-13s %s" % (cmdname, cmdclass.short_desc()))
if not inproject:
print()
print(" [ more ] More commands available when run from project directory")
print()
print('Use "scrapy <command> -h" to see more info about a command')
def _print_unknown_command(settings, cmdname, inproject):
_print_header(settings, inproject)
print("Unknown command: %s\n" % cmdname)
print('Use "scrapy" to see available commands')
def _run_print_help(parser, func, *a, **kw):
try:
func(*a, **kw)
except UsageError as e:
if str(e):
parser.error(str(e))
if e.print_help:
parser.print_help()
sys.exit(2)
def execute(argv=None, settings=None):
if argv is None:
argv = sys.argv
# --- backwards compatibility for scrapy.conf.settings singleton ---
if settings is None and 'scrapy.conf' in sys.modules:
from scrapy import conf
if hasattr(conf, 'settings'):
settings = conf.settings
# ------------------------------------------------------------------
if settings is None:
settings = get_project_settings()
# set EDITOR from environment if available
try:
editor = os.environ['EDITOR']
except KeyError: pass
else:
settings['EDITOR'] = editor
check_deprecated_settings(settings)
# --- backwards compatibility for scrapy.conf.settings singleton ---
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
with warnings.catch_warnings():
warnings.simplefilter("ignore", ScrapyDeprecationWarning)
from scrapy import conf
conf.settings = settings
# ------------------------------------------------------------------
inproject = inside_project()
cmds = _get_commands_dict(settings, inproject)
cmdname = _pop_command_name(argv)
parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
conflict_handler='resolve')
if not cmdname:
_print_commands(settings, inproject)
sys.exit(0)
elif cmdname not in cmds:
_print_unknown_command(settings, cmdname, inproject)
sys.exit(2)
cmd = cmds[cmdname]
parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
parser.description = cmd.long_desc()
settings.setdict(cmd.default_settings, priority='command')
cmd.settings = settings
cmd.add_options(parser)
opts, args = parser.parse_args(args=argv[1:])
_run_print_help(parser, cmd.process_options, args, opts)
cmd.crawler_process = CrawlerProcess(settings)
_run_print_help(parser, _run_command, cmd, args, opts)
sys.exit(cmd.exitcode)
def _run_command(cmd, args, opts):
if opts.profile:
_run_command_profiled(cmd, args, opts)
else:
cmd.run(args, opts)
def _run_command_profiled(cmd, args, opts):
if opts.profile:
sys.stderr.write("scrapy: writing cProfile stats to %r\n" % opts.profile)
loc = locals()
p = cProfile.Profile()
p.runctx('cmd.run(args, opts)', globals(), loc)
if opts.profile:
p.dump_stats(opts.profile)
if __name__ == '__main__':
try:
execute()
finally:
# Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect()
# on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
garbage_collect()
``
[1]: https://i.sstatic.net/ELQdK.png
I downgraded my scrapy
version to 1.5.2, and somehow it worked. This is very strange and it seems to me that the newer versions have removed a couple of files (like deprecated.py). This was very annoying.