from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor
import time
# Current timing: 48.77885 s per page, 4.4344 s per region
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
def list_to_csv(summoner_info):
    summoner_info = set([tuple(summoner) for summoner in summoner_info])
    with open('high_elo_summoners.csv', 'w', encoding='utf-8') as f:
        for summoner in summoner_info:
            f.write(f"{summoner[0]},{summoner[1]},{summoner[2]}\n")

def gather_summoner_info(url):
    driver.get(url)
    driver.implicitly_wait(5)  # Wait until the CSS Selector is available
    summoner_info = []
    content = driver.find_elements(By.CLASS_NAME, 'rt-tr')
    for index, con in enumerate(content):
        if index != 0:
            summoner = con.text.split('\n')
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(' ')[0].replace(',', ''))])
        else:
            pass
    return summoner_info
def get_summoner_data(page_count, regions):
    links = [f'https://u.gg/leaderboards/ranking?region={region}&page={page + 1}'
             for page in range(page_count) for region in regions]
    # Gather all the relevant summoner information on the page
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_results = {url: executor.submit(gather_summoner_info, url) for url in links}
        for url, future in future_results.items():
            #print(future.result())
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)
def main():
    page_count = 1
    regions = ['na1', 'euw1', 'eun1', 'kr', 'br1', 'jp1', 'ru', 'oc1', 'tr1', 'la1', 'la2']
    get_summoner_data(page_count, regions)

if __name__ == '__main__':
    s = time.perf_counter()
    main()
    e = time.perf_counter()
    print(e - s)
Issue: the code returns the same output for each iteration (the results from the first link of the links list).
The code above pulls some information from the links variable using Selenium. The issue is that when the threads execute in the get_summoner_data() function, every task returns the same results, even though a different link prints from each gather_summoner_info() call. Currently it just returns the information from the very first link. I'm not sure what is causing the issue; any help is appreciated.
Approach
Try running without the --headless option. You will see what's going on.
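For example, with the headless flag commented out you can watch the browser window while the threads run, which makes the page-swapping visible:

options = Options()
# options.add_argument("--headless")  # disabled so the Chrome window stays visible while debugging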
Problem
You created only one instance of the web driver, and that single instance is shared by all the threaded tasks. Multiple threads try to load different URLs on this one driver, so each task ends up reading whatever page was loaded most recently, very likely the last URL requested.
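To make the race concrete, here is a minimal sketch (the two example.* URLs are placeholders) of two threads sharing one driver; both results may describe whichever page was navigated to last:

driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)  # one shared browser

def read_title(url):
    driver.get(url)      # every thread navigates the same browser window
    return driver.title  # may belong to a page another thread just loaded

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(read_title, u)
               for u in ('https://example.com', 'https://example.org')]
    print([f.result() for f in futures])  # often prints the same title twice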
Fix
A simple fix is to create a driver instance for every threaded task. You can do this by moving the line that creates the web driver into the thread task function gather_summoner_info, as below. I tried this fix and it works correctly.
def gather_summoner_info(url):
    ##### moved ######
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    ##################
    driver.get(url)
    driver.implicitly_wait(5)  # Wait until the CSS Selector is available
    summoner_info = []
    content = driver.find_elements(By.CLASS_NAME, "rt-tr")
    for index, con in enumerate(content):
        if index != 0:
            summoner = con.text.split("\n")
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
        else:
            pass
    return summoner_info
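One caveat: nothing closes these per-task drivers, so each call leaks a Chrome process. A hedged variant of the same function that always quits the driver:

def gather_summoner_info(url):
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    try:
        driver.get(url)
        driver.implicitly_wait(5)
        summoner_info = []
        for index, con in enumerate(driver.find_elements(By.CLASS_NAME, "rt-tr")):
            if index != 0:
                summoner = con.text.split("\n")
                summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
        return summoner_info
    finally:
        driver.quit()  # release the browser even if scraping raises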
Further Consideration
As you know, creating a new web driver instance is resource-expensive. If you are just trying to scrape information, HTTP requests are enough most of the time.
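If you do want to stay with Selenium, a middle ground (a sketch I have not run against this site) is to cache one driver per worker thread with threading.local(), so each of the pool's workers creates a single browser and reuses it across tasks:

import threading

thread_local = threading.local()

def get_driver():
    # Lazily create one driver per worker thread; the pool's threads persist,
    # so later tasks running on the same thread reuse the same browser.
    if not hasattr(thread_local, "driver"):
        thread_local.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    return thread_local.driver

def gather_summoner_info(url):
    driver = get_driver()  # per-thread driver instead of per-call
    driver.get(url)
    # ... same scraping logic as above; remember to quit the drivers when done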
For the website you are trying to scrape, I found that the job can be done with HTTP requests only: the leaderboard is fetched from a GraphQL endpoint, which you can call directly. I revised the script without Selenium, and it takes less than 1 second to load the leaderboards for all regions.
import json
import time
from concurrent.futures import ThreadPoolExecutor
import requests
def list_to_csv(summoner_info):
    # Sort by LP, highest first
    summoner_info = sorted(summoner_info, key=lambda x: int(x[2]), reverse=True)
    with open("result.csv", "w", encoding="utf-8") as f:
        f.write("\n".join([",".join(map(str, item)) for item in summoner_info]))
def gather_summoner_info(region: str):
    payload = json.dumps(
        {
            "operationName": "getRankedLeaderboard",
            "variables": {"page": 1, "queueType": 420, "regionId": region},
            "query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n totalPlayerCount\n topPlayerMostPlayedChamp\n players {\n iconId\n losses\n lp\n overallRanking\n rank\n summonerLevel\n summonerName\n tier\n wins\n __typename\n }\n __typename\n }\n}\n",
        }
    )
    headers = {"Content-Type": "application/json"}
    response = requests.post("https://u.gg/api", headers=headers, data=payload)
    summoner_info = []
    data = response.json()
    for player in data["data"]["leaderboardPage"]["players"]:
        summoner_info.append((player["summonerName"], player["tier"], player["lp"]))
    return summoner_info
def get_summoner_data(page_count, regions):
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_results = {r: executor.submit(gather_summoner_info, r) for r in regions}
        for _, future in future_results.items():
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)
def main():
    page_count = 1
    regions = ["na1", "euw1", "eun1", "kr", "br1", "jp1", "ru", "oc1", "tr1", "la1", "la2"]
    get_summoner_data(page_count, regions)

if __name__ == "__main__":
    s = time.perf_counter()
    main()
    e = time.perf_counter()
    print(e - s)
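Note that the payload above hardcodes "page": 1, so page_count is currently unused. If you want deeper pages, one hedged extension (assuming gather_summoner_info is changed to accept a page argument and pass it in the "variables" dict) is to fan out over region/page pairs:

def get_summoner_data(page_count, regions):
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        # One task per (region, page) combination
        futures = [executor.submit(gather_summoner_info, region, page)
                   for region in regions
                   for page in range(1, page_count + 1)]
        for future in futures:
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)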