Tags: python-3.x, selenium, python-multithreading, threadpoolexecutor, selenium-webdriver-python

Unexpected Multithreading Output when Web Scraping with Selenium (Python)


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor
import time

# Current timing: 48.77885 s per page, 4.4344 s per region
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)


def list_to_csv(summoner_info):
    summoner_info = set([tuple(summoner) for summoner in summoner_info])
    with open('high_elo_summoners.csv', 'w', encoding='utf-8') as f:
        for summoner in summoner_info:
            f.write(f"{summoner[0]},{summoner[1]},{summoner[2]}\n")


def gather_summoner_info(url):
    driver.get(url)
    driver.implicitly_wait(5)  # Implicit wait: poll up to 5 s when locating elements

    summoner_info = []
    content = driver.find_elements(By.CLASS_NAME, 'rt-tr')
    for index, con in enumerate(content):
        if index != 0:
            summoner = con.text.split('\n')
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(' ')[0].replace(',', ''))])
        else:
            pass
    return summoner_info


def get_summoner_data(page_count, regions):
    links = [f'https://u.gg/leaderboards/ranking?region={region}&page={page + 1}'
             for page in range(page_count) for region in regions]

    # Gather all the relevant summoner information on the page
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_results = {url: executor.submit(gather_summoner_info, url) for url in links}
        for url, future in future_results.items():
            agg_summoner_info.extend(future.result())

    list_to_csv(agg_summoner_info)


def main():
    page_count = 1
    regions = ['na1', 'euw1', 'eun1', 'kr', 'br1', 'jp1', 'ru', 'oc1', 'tr1', 'la1', 'la2']
    get_summoner_data(page_count, regions)


if __name__ == '__main__':
    s = time.perf_counter()
    main()
    e = time.perf_counter()
    print(e - s)

Issue: the code returns the same output on every iteration (the results for the first link in the links list)

The code above pulls information from each URL in the links variable using Selenium. The issue is that when the threads execute in the get_summoner_data() function, every task returns the same results, even though the different links print from each gather_summoner_info() call.

Currently it just returns the information from the very first link. I'm not sure what is causing this; any help is appreciated.


Solution

  • Approach: Try running without the --headless option. You will see what is going on.

    Problem: You created only one WebDriver instance, and that single driver is shared by all the threaded tasks. Multiple threads tell the same driver to load different URLs, so in practice whichever URL was requested last is the page every task ends up scraping.

    Fix: A simple fix is to create a driver instance for every thread. You can do this by moving the line that creates the web driver into the thread task function gather_summoner_info, as below. I tried this fix and it works correctly.

    def gather_summoner_info(url):
        ##### moved: each thread now creates its own driver ######
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        ##########################################################
        try:
            driver.get(url)
            driver.implicitly_wait(5)  # Implicit wait: poll up to 5 s when locating elements

            summoner_info = []
            content = driver.find_elements(By.CLASS_NAME, "rt-tr")
            for index, con in enumerate(content):
                if index == 0:
                    continue  # Skip the header row
                summoner = con.text.split("\n")
                summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
        finally:
            driver.quit()  # Quit each driver so the threads don't leak Chrome processes

        return summoner_info
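
    One refinement worth noting (an addition to the answer, not part of the original fix): creating and quitting a browser for every URL is expensive. Since ThreadPoolExecutor reuses its worker threads, you can instead keep one driver per thread with threading.local(). A minimal sketch follows; get_thread_driver is a hypothetical helper, and options is the Options object from the original script:

    import threading

    thread_local = threading.local()

    def get_thread_driver():
        # Lazily create one driver per worker thread, then reuse it for
        # every URL that this thread processes
        if getattr(thread_local, "driver", None) is None:
            thread_local.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        return thread_local.driver

    def gather_summoner_info(url):
        driver = get_thread_driver()  # Reused across tasks on the same thread
        driver.get(url)
        driver.implicitly_wait(5)
        # ... same scraping logic as above ...

    The trade-off is that you must track each thread's driver and quit it yourself once all the work is done, rather than quitting once per URL.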
    

    Further Consideration: As you know, creating a new web driver instance is resource-expensive. If you are just trying to scrape information, plain HTTP requests are enough most of the time.

    For the website you are trying to scrape, I found that the job can be done with HTTP requests alone. I revised the script without Selenium, and it takes less than 1 second to load the leaderboards for all regions.

    import json
    import time
    from concurrent.futures import ThreadPoolExecutor
    
    import requests
    
    
    def list_to_csv(summoner_info):
        # Sort by LP descending; map(str, ...) guards against numeric fields
        summoner_info = sorted(summoner_info, key=lambda x: int(x[2]), reverse=True)
        with open("result.csv", "w", encoding="utf-8") as f:
            f.write("\n".join([",".join(map(str, item)) for item in summoner_info]))
    
    
    def gather_summoner_info(region: str):
        payload = json.dumps(
            {
                "operationName": "getRankedLeaderboard",
                "variables": {"page": 1, "queueType": 420, "regionId": region},
                "query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n  leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n    totalPlayerCount\n    topPlayerMostPlayedChamp\n    players {\n      iconId\n      losses\n      lp\n      overallRanking\n      rank\n      summonerLevel\n      summonerName\n      tier\n      wins\n      __typename\n    }\n    __typename\n  }\n}\n",
            }
        )
        headers = {"Content-Type": "application/json"}
        response = requests.post("https://u.gg/api", headers=headers, data=payload)
    
        summoner_info = []
        data = response.json()
        for player in data["data"]["leaderboardPage"]["players"]:
            summoner_info.append((player["summonerName"], player["tier"], player["lp"]))
    
        return summoner_info
    
    
    def get_summoner_data(page_count, regions):
        # Note: page_count is unused here; the payload above requests page 1 only
        agg_summoner_info = []
        with ThreadPoolExecutor(max_workers=20) as executor:
            future_results = {r: executor.submit(gather_summoner_info, r) for r in regions}
            for _, future in future_results.items():
                agg_summoner_info.extend(future.result())
    
        list_to_csv(agg_summoner_info)
    
    
    def main():
        page_count = 1
        regions = ["na1", "euw1", "eun1", "kr", "br1", "jp1", "ru", "oc1", "tr1", "la1", "la2"]
        get_summoner_data(page_count, regions)
    
    
    if __name__ == "__main__":
        s = time.perf_counter()
        main()
        e = time.perf_counter()
        print(e - s)
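
    To make use of page_count in this requests version, you could fan out (region, page) pairs across the pool. A sketch of that change (my addition, not part of the original answer); it assumes gather_summoner_info is modified to accept (region, page) and to send {"page": page, ...} in the payload's "variables" field:

    def get_summoner_data(page_count, regions):
        agg_summoner_info = []
        with ThreadPoolExecutor(max_workers=20) as executor:
            # One task per (region, page) combination
            futures = [
                executor.submit(gather_summoner_info, region, page)
                for region in regions
                for page in range(1, page_count + 1)
            ]
            for future in futures:
                agg_summoner_info.extend(future.result())
        list_to_csv(agg_summoner_info)

    If you go this route, sharing a requests.Session across the threads would also let them reuse HTTP connections instead of opening a new one per request.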
    
