Search code examples
pythonscrapytypeerrorscrapy-splash

TypeError: css() takes exactly 2 arguments (3 given)


Receiving the following error, TypeError: css() takes exactly 2 arguments (3 given), when trying to run the following web scraping program. I need the program to scrape these card names from the following website, http://www.starcitygames.com/catalog/category/Duel%20Decks%20Venser%20vs%20Koth and the problem is in order to scrape all the content I need I have to have two variables in the loop. But I do not know if I even set the loop up right for it to work simultaneously for it to look for both strings in the HTML code.

SplashSpider.py

import csv
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import GameItem

# process the csv file so the url + ip address + useragent pairs are the same as defined in the file # returns a list of dictionaries, example:
# [ {'url': 'http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan',
#    'ip': 'http://204.152.114.244:8050',
#    'ua': "Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11"},
#    ...
# ]
def process_csv(csv_file):
    data = []
    reader = csv.reader(csv_file)
    next(reader)
    for fields in reader:
        if fields[0] != "":
            url = fields[0]
        else:
            continue # skip the whole row if the url column is empty
        if fields[1] != "":
            ip = "http://" + fields[1] + ":8050" # adding http and port because this is the needed scheme
        if fields[2] != "":
            useragent = fields[2]
        data.append({"url": url, "ip": ip, "ua": useragent})
    return data


class MySpider(Spider):
    name = 'splash_spider'  # Name of Spider

    # notice that we don't need to define start_urls
    # just make sure to get all the urls you want to scrape inside start_requests function

    # getting all the url + ip address + useragent pairs then request them
    def start_requests(self):

        # get the file path of the csv file that contains the pairs from the settings.py
        with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
           # requests is a list of dictionaries like this -> {url: str, ua: str, ip: str}
            requests = process_csv(csv_file)

        for req in requests:
            # no need to create custom middlewares
            # just pass useragent using the headers param, and pass proxy using the meta param

            yield SplashRequest(url=req["url"], callback=self.parse, args={"wait": 3},
                    headers={"User-Agent": req["ua"]},
                    splash_url = req["ip"],
                    )

    # Scraping
    def parse(self, response):
        item = GameItem()
        for game1,game2 in response.css("tr.deckdbbody_row","tr.deckdbbody2_row"):
            # Card Name
            item["card_name"] = game_1.css("a.card_popup::text").extract_first()
            item["card_name"] = game_2.css("a.card_popup::text").extract_first()

Error message

2019-06-04 10:36:39 [scrapy.core.scraper] ERROR: Spider error processing <GET http://www.starcitygames.com/catalog/category/Duel%20Decks%20Venser%20vs%20Koth via http://204.152.114.229:8050/render.html> (referer: None) Traceback (most recent call last):
  File "/home/trno224/.local/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/home/trno224/.local/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
    for x in result:
  File "/home/trno224/.local/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/home/trno224/.local/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/home/trno224/.local/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/home/trno224/scrapy_splash/scrapy_javascript/scrapy_javascript/spiders/SplashSpider.py", line 56, in parse
    for game1,game2 in response.css("tr.deckdbbody_row","tr.deckdbbody2_row"):
TypeError: css() takes exactly 2 arguments (3 given)
2019-06-04 10:36:39 [scrapy.core.engine] INFO: Closing spider (finished)
2019-06-04 10:36:39 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1149,
 'downloader/request_count': 3,
 'downloader/request_method_count/GET': 2,
 'downloader/request_method_count/POST': 1,
 'downloader/response_bytes': 131347,
 'downloader/response_count': 3,
 'downloader/response_status_count/200': 2,
 'downloader/response_status_count/404': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2019, 6, 4, 14, 36, 39, 136675),
 'log_count/DEBUG': 3,
 'log_count/ERROR': 1,
 'log_count/INFO': 10,
 'memusage/max': 55750656,
 'memusage/startup': 54394880,
 'response_received_count': 3,
 'robotstxt/request_count': 2,
 'robotstxt/response_count': 2,
 'robotstxt/response_status_count/200': 1,
 'robotstxt/response_status_count/404': 1,
 'scheduler/dequeued': 2,
 'scheduler/dequeued/memory': 2,
 'scheduler/enqueued': 2,
 'scheduler/enqueued/memory': 2,
 'spider_exceptions/TypeError': 1,
 'splash/render.html/request_count': 1,
 'splash/render.html/response_count/200': 1,
 'start_time': datetime.datetime(2019, 6, 4, 14, 35, 28, 761764)}

Solution

  • You need this CSS expression (similar to starts-with()):

    for game in response.css("tr[class^=deckdbbody]"):
        # Card Name
        item["card_name"] = game.css("a.card_popup::text").extract_first()