Tags: python, scrapy, user-agent, scrapy-splash, splash-js-render

Trying to fake and rotate user agents


I am trying to fake user agents as well as rotate them in Python.
I found a tutorial online about how to do this with Scrapy using the scrapy-useragents package.
I scrape the webpage https://www.whatsmyua.info/ to check my user agent and see whether it is different from mine and whether it rotates. It is different from my actual user agent, but it does not rotate; the same user agent is returned on every request, and I cannot figure out what is going wrong.
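
For reference, a quick way to see which user agent Scrapy itself attaches to each request is a throwaway spider that logs the header from the callback; the spider name and log message below are illustrative and not part of the original project:

import scrapy

class UACheckSpider(scrapy.Spider):
    name = "ua_check"  # illustrative name
    start_urls = ["https://www.whatsmyua.info/"]

    def parse(self, response):
        # log the User-Agent header that was attached to the outgoing request
        self.logger.info("Sent UA: %s", response.request.headers.get("User-Agent"))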

settings.py

BOT_NAME = 'scrapy_javascript'

SPIDER_MODULES = ['scrapy_javascript.spiders']
NEWSPIDER_MODULE = 'scrapy_javascript.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_javascript (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# -----------------------------------------------------------------------------
# USER AGENT
# -----------------------------------------------------------------------------
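# note: this second DOWNLOADER_MIDDLEWARES assignment replaces the dict defined
# above, so the scrapy-splash middlewares registered there are dropped; if both
# sets are needed they have to be merged into a single dict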

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
}


USER_AGENTS = [
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/57.0.2987.110 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/61.0.3163.79 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) '
     'Gecko/20100101 '
     'Firefox/55.0'),  # firefox
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/61.0.3163.91 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/62.0.3202.89 '
     'Safari/537.36'),  # chrome
    ('Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/63.0.3239.108 '
     'Safari/537.36'),  # chrome
]

SPLASH_URL = 'http://199.89.192.74:8050'


DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

Solution

  • Figured it out by creating a CSV file with all my URLs, each paired with an IP (proxy) and a user agent, so every time I access a webpage I use that IP and user agent. Then I had to override splash_url in my spider so that splash_url equals the proxy I am using at that moment. A sketch of the CSV layout is shown below.
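
    For reference, proxies.csv could be laid out like this, matching the column order process_csv below expects (a header row, then one URL, proxy IP, and user agent per row); the header names are illustrative, and the sample row reuses the example values from the comment in SplashSpider.py:

    url,ip,user_agent
    http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan,204.152.114.244,Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11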

    SplashSpider.py

    import csv
    from scrapy.spiders import Spider
    from scrapy_splash import SplashRequest
    from ..items import GameItem
    
    # process the csv file so the url + ip address + useragent pairs are the same as defined in the file
    # returns a list of dictionaries, example:
    # [ {'url': 'http://www.starcitygames.com/catalog/category/Rivals%20of%20Ixalan',
    #    'ip': 'http://204.152.114.244:8050',
    #    'ua': "Mozilla/5.0 (BlackBerry; U; BlackBerry 9320; en-GB) AppleWebKit/534.11"},
    #    ...
    # ]
    def process_csv(csv_file):
        data = []
        reader = csv.reader(csv_file)
        next(reader)
        for fields in reader:
            if fields[0] != "":
                url = fields[0]
            else:
                continue # skip the whole row if the url column is empty
            if fields[1] != "":
                ip = "http://" + fields[1] + ":8050" # add the scheme and Splash port expected by splash_url
            if fields[2] != "":
                useragent = fields[2]
            # if the ip or useragent column is empty, the previous row's value is reused
            # (so the first data row must have both columns filled)
            data.append({"url": url, "ip": ip, "ua": useragent})
        return data
    
    
    class MySpider(Spider):
        name = 'splash_spider'  # Name of Spider
    
        # notice that we don't need to define start_urls
        # just make sure to get all the urls you want to scrape inside start_requests function
    
        # getting all the url + ip address + useragent pairs then request them
        def start_requests(self):
    
            # get the file path of the csv file that contains the pairs from the settings.py
            with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
               # requests is a list of dictionaries like this -> {url: str, ua: str, ip: str}
                requests = process_csv(csv_file)
    
            for req in requests:
                # no need to create custom middlewares
                # just pass useragent using the headers param, and pass proxy using the meta param
    
                yield SplashRequest(url=req["url"], callback=self.parse, args={"wait": 3},
                        headers={"User-Agent": req["ua"]},
                        splash_url = req["ip"],
                        )
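
        # the parse callback is omitted in the original answer; this minimal
        # placeholder (a sketch, not the author's code) just yields the page
        # URL and title so the proxy / user-agent pairs can be sanity-checked
        def parse(self, response):
            yield {"url": response.url, "title": response.css("title::text").get()}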
    

    settings.py

    BOT_NAME = 'scrapy_javascript'
    
    SPIDER_MODULES = ['scrapy_javascript.spiders']
    NEWSPIDER_MODULE = 'scrapy_javascript.spiders'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # The path of the csv file that contains the pairs
    PROXY_CSV_FILE = "proxies.csv"
    
    DOWNLOADER_MIDDLEWARES = {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }
    
    #SPLASH_URL = 'http://127.0.0.1:8050'
    
    #SPLASH_URL = 'http://localhost:8050'
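    # note: the commented-out values above assume a local Splash instance;
    # Splash is typically started as a Docker container, e.g.
    #     docker run -p 8050:8050 scrapinghub/splash
    # SPLASH_URL is not set here because the spider overrides splash_url on
    # every request with the proxy taken from the CSV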
    DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
    HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
    
    
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 16
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 60
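
    With the settings above and a proxies.csv in the directory the command is run from (PROXY_CSV_FILE is a relative path), the spider runs like any other Scrapy spider; the output file name is only an example:

        scrapy crawl splash_spider -o output.json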