Tags: python-3.x, selenium, python-multithreading, threadpoolexecutor, selenium-webdriver-python

Unexpected Multithreading Output when Web Scraping with Selenium (Python)


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor
import time

# Current timing: 48.77885 s per page, 4.4344 s per region
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)


def list_to_csv(summoner_info):
    summoner_info = set([tuple(summoner) for summoner in summoner_info])
    with open('high_elo_summoners.csv', 'w', encoding='utf-8') as f:
        for summoner in summoner_info:
            f.write(f"{summoner[0]},{summoner[1]},{summoner[2]}\n")


def gather_summoner_info(url):
    driver.get(url)
    driver.implicitly_wait(5)  # Implicit wait: poll up to 5 s when locating elements

    summoner_info = []
    content = driver.find_elements(By.CLASS_NAME, 'rt-tr')
    for index, con in enumerate(content):
        if index != 0:
            summoner = con.text.split('\n')
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(' ')[0].replace(',', ''))])
        else:
            pass
    return summoner_info


def get_summoner_data(page_count, regions):
    links = [f'https://u.gg/leaderboards/ranking?region={region}&page={page + 1}'
             for page in range(page_count) for region in regions]

    # Gather all the relevant summoner information on the page
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_results = {url: executor.submit(gather_summoner_info, url) for url in links}
        for url, future in future_results.items():
            agg_summoner_info.extend(future.result())

    list_to_csv(agg_summoner_info)


def main():
    page_count = 1
    regions = ['na1', 'euw1', 'eun1', 'kr', 'br1', 'jp1', 'ru', 'oc1', 'tr1', 'la1', 'la2']
    get_summoner_data(page_count, regions)


if __name__ == '__main__':
    s = time.perf_counter()
    main()
    e = time.perf_counter()
    print(e - s)

Issue: the code returns the same output on every iteration (the results for the first link in the links list)

The code above pulls information from each URL in the links variable using Selenium. The issue is that when the threads execute in the get_summoner_data() function, every task returns the same results, even though the different links print from each gather_summoner_info() call.

Currently it just returns the information from the very first link. I'm not sure what is causing this; any help is appreciated.


Solution

  • Approach: Try running without the --headless option. You will see what is going on.

    Problem: You created only one WebDriver instance, and that single driver is shared by all the threaded tasks. Multiple threads tell the same driver to load different URLs, so in practice whichever URL was requested last is the page every task ends up scraping.

    Fix: A simple fix is to create a driver instance for every thread. You can do this by moving the line that creates the web driver into the thread task function gather_summoner_info, as below. I tried this fix and it works correctly.

    def gather_summoner_info(url):
        ##### moved: each thread now creates its own driver ######
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        ##########################################################
        try:
            driver.get(url)
            driver.implicitly_wait(5)  # Implicit wait: poll up to 5 s when locating elements

            summoner_info = []
            content = driver.find_elements(By.CLASS_NAME, "rt-tr")
            for index, con in enumerate(content):
                if index == 0:
                    continue  # Skip the header row
                summoner = con.text.split("\n")
                summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
        finally:
            driver.quit()  # Quit each driver so the threads don't leak Chrome processes

        return summoner_info
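
    One refinement worth noting (an addition to the answer, not part of the original fix): creating and quitting a browser for every URL is expensive. Since ThreadPoolExecutor reuses its worker threads, you can instead keep one driver per thread with threading.local(). A minimal sketch follows; get_thread_driver is a hypothetical helper, and options is the Options object from the original script:

    import threading

    thread_local = threading.local()

    def get_thread_driver():
        # Lazily create one driver per worker thread, then reuse it for
        # every URL that this thread processes
        if getattr(thread_local, "driver", None) is None:
            thread_local.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        return thread_local.driver

    def gather_summoner_info(url):
        driver = get_thread_driver()  # Reused across tasks on the same thread
        driver.get(url)
        driver.implicitly_wait(5)
        # ... same scraping logic as above ...

    The trade-off is that you must track each thread's driver and quit it yourself once all the work is done, rather than quitting once per URL.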
    

    Further Consideration: As you know, creating a new web driver instance is resource-expensive. If you are just trying to scrape information, plain HTTP requests are enough most of the time.

    For the website you are trying to scrape, I found that the job can be done with HTTP requests alone. I revised the script without Selenium, and it takes less than 1 second to load the leaderboards for all regions.

    import json
    import time
    from concurrent.futures import ThreadPoolExecutor
    
    import requests
    
    
    def list_to_csv(summoner_info):
        # Sort by LP descending; map(str, ...) guards against numeric fields
        summoner_info = sorted(summoner_info, key=lambda x: int(x[2]), reverse=True)
        with open("result.csv", "w", encoding="utf-8") as f:
            f.write("\n".join([",".join(map(str, item)) for item in summoner_info]))
    
    
    def gather_summoner_info(region: str):
        payload = json.dumps(
            {
                "operationName": "getRankedLeaderboard",
                "variables": {"page": 1, "queueType": 420, "regionId": region},
                "query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n  leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n    totalPlayerCount\n    topPlayerMostPlayedChamp\n    players {\n      iconId\n      losses\n      lp\n      overallRanking\n      rank\n      summonerLevel\n      summonerName\n      tier\n      wins\n      __typename\n    }\n    __typename\n  }\n}\n",
            }
        )
        headers = {"Content-Type": "application/json"}
        response = requests.post("https://u.gg/api", headers=headers, data=payload)
    
        summoner_info = []
        data = response.json()
        for player in data["data"]["leaderboardPage"]["players"]:
            summoner_info.append((player["summonerName"], player["tier"], player["lp"]))
    
        return summoner_info
    
    
    def get_summoner_data(page_count, regions):
        # Note: page_count is unused here; the payload above requests page 1 only
        agg_summoner_info = []
        with ThreadPoolExecutor(max_workers=20) as executor:
            future_results = {r: executor.submit(gather_summoner_info, r) for r in regions}
            for _, future in future_results.items():
                agg_summoner_info.extend(future.result())
    
        list_to_csv(agg_summoner_info)
    
    
    def main():
        page_count = 1
        regions = ["na1", "euw1", "eun1", "kr", "br1", "jp1", "ru", "oc1", "tr1", "la1", "la2"]
        get_summoner_data(page_count, regions)
    
    
    if __name__ == "__main__":
        s = time.perf_counter()
        main()
        e = time.perf_counter()
        print(e - s)
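
    To make use of page_count in this requests version, you could fan out (region, page) pairs across the pool. A sketch of that change (my addition, not part of the original answer); it assumes gather_summoner_info is modified to accept (region, page) and to send {"page": page, ...} in the payload's "variables" field:

    def get_summoner_data(page_count, regions):
        agg_summoner_info = []
        with ThreadPoolExecutor(max_workers=20) as executor:
            # One task per (region, page) combination
            futures = [
                executor.submit(gather_summoner_info, region, page)
                for region in regions
                for page in range(1, page_count + 1)
            ]
            for future in futures:
                agg_summoner_info.extend(future.result())
        list_to_csv(agg_summoner_info)

    If you go this route, sharing a requests.Session across the threads would also let them reuse HTTP connections instead of opening a new one per request.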
    
