
Twisted: cannot create weak reference to 'NoneType' object in Python


I am currently building a scraper with Scrapy which unfortunately fails with the error log below. I tried running it with both CrawlerRunner and CrawlerProcess, and both versions fail. I tried to figure out whether I had used Twisted incorrectly, but I think I am using it correctly.

2018-04-18 23:55:46 [twisted] CRITICAL: 
Traceback (most recent call last):
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/twisted/internet/defer.py", line 1386, in _inlineCallbacks
result = g.send(result)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/crawler.py", line 79, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/crawler.py", line 102, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/spiders/__init__.py", line 52, in from_crawler
spider._set_crawler(crawler)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/spiders/__init__.py", line 67, in _set_crawler
crawler.signals.connect(self.close, signals.spider_closed)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/signalmanager.py", line 26, in connect
return dispatcher.connect(receiver, signal, **kwargs)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/pydispatch/dispatcher.py", line 130, in connect
receiver = saferef.safeRef(receiver, onDelete=_removeReceiver)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/pydispatch/saferef.py", line 32, in safeRef
return weakref.ref(target, onDelete)
TypeError: cannot create weak reference to 'NoneType' object
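
For reference, the TypeError at the bottom is plain CPython behaviour rather than anything Scrapy-specific: weakref.ref() refuses None (only weakref-able objects can be wrapped), which suggests the receiver being connected here, self.close, was None at connect time. A minimal sketch of just that failure:

import weakref

# weakref.ref() cannot wrap None; this is the exact error in the log above
try:
    weakref.ref(None)
except TypeError as exc:
    print(exc)  # cannot create weak reference to 'NoneType' object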

My code looks like this:

import scrapy
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from classes import cars

LINKS = []
CARS = []

class AutoSpiderLinks(scrapy.Spider):
    name = "Auto_get_links"
    ROOT_URL = "https://www.somewebsite"
    global LINKS

    def geturls(self):
        main_url = "https://www.somewebsite"
        target_url = []
        for x in range(1, 2):
            target_url.append(main_url + "&page=" + str(x))
            print(target_url[-1])  # print the URL that was just added
        return target_url

    def start_requests(self):
        urls = self.geturls()
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        important_divs = response.css('div.cldt-summary-titles').extract()
        self.convert(important_divs)

def main():
    configure_logging()
    runner = CrawlerRunner()
    runner.crawl(AutoSpiderLinks)
    runner.crawl(DeepSpider)  # DeepSpider is a second spider defined elsewhere
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

if __name__ == '__main__':
    main()

Thank you for your assistance


Solution

  • So I've figured it out. Apparently you cannot schedule the crawls and drive the reactor from inside a plain main() function; the crawls have to be scheduled first and the reactor run afterwards, at module level. The fix was straightforward with Twisted's reactor and inlineCallbacks:

    @defer.inlineCallbacks
    def main():
        configure_logging()
        runner = CrawlerRunner()
        # Each yield waits for the previous crawl to finish, so the
        # spiders run sequentially instead of racing each other.
        yield runner.crawl(AutoSpiderLinks)
        yield runner.crawl(DeepSpider)
        reactor.stop()  # shut the reactor down once both crawls are done


    if __name__ == '__main__':
        main()          # schedules the crawls; the reactor is not running yet
        reactor.run()   # blocks here until reactor.stop() is called
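
  • As an aside: if you do not need to manage the reactor yourself at all, the Scrapy docs also offer CrawlerProcess, which starts and stops the reactor for you. A minimal sketch of the same two-spider run, assuming the same AutoSpiderLinks and DeepSpider classes:

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(AutoSpiderLinks)
    process.crawl(DeepSpider)
    process.start()  # starts the reactor and blocks until the spiders finish

    Note that, unlike the inlineCallbacks version above, this schedules both spiders up front, so they run concurrently rather than one after the other.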