I am trying to scrape the table contained in the following page: https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=1&groups_id=
At the top of the table, I want to change the model selection from "1" to "- All -". I wrote the following lines of code:
# Simplified attempt: the dropdown's display text "- All -" is placed directly
# into the `model` query parameter, and the resulting URL is opened in the browser.
link = f"https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=- All -&groups_id="
browser.get(link)
However, this isn't working.
When I replace `model=- All -` with `model=1`, the code works, so I suspect the problem lies with my `- All -` option, but I can't figure out what it is.
The full code is below, including the loop over all target and model options (the version above was simplified):
from bs4 import BeautifulSoup,NavigableString, Tag
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import csv
import os
import numpy as np
# Scrape script from the question: open predictioncenter.org, click through to
# the results page of target T1024, read the entries of the "target" and "model"
# dropdowns, then request the results table once per target.

# Work from the folder that holds the chromedriver binary.
os.chdir('THE DIRECTORY WHERE YOUR CHROMEDRIVER IS')

# Browser configuration: headless mode, with the 'enable-logging' switch excluded.
# Kept under its own name so it is not overwritten by the <option> list below.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("headless")
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

# The options must be handed to the driver, otherwise they are never applied.
browser = webdriver.Chrome(
    executable_path='THE DIRECTORY WHERE YOUR CHROMEDRIVER IS/chromedriver',
    options=chrome_options,
)
browser.get("https://predictioncenter.org/") #open page in browser
df = pd.DataFrame()

# Click through three links in turn; click() returns nothing, so no result is kept.
browser.find_elements(By.XPATH, "//a[contains(@id, 'ygtvlabelel6')]")[0].click()
browser.find_elements(By.XPATH, "//a[contains(@href, 'results.cgi')]")[0].click()
browser.find_elements(By.XPATH, "//a[contains(@id, 'a_T1024')]")[0].click()

# Parse the page that is currently loaded in the browser.
content = browser.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")

#Get all possible options
options = soup.find("select", {"name": "target"}).findAll("option")
list_prot = [option.text for option in options]

type_model = soup.find("select", {"name": "model"}).findAll("option")
model_t = [option.text for option in type_model]

# First entry of the model dropdown, as displayed on the page.
mod = model_t[0]

i = 0
final = pd.DataFrame()
for target in list_prot:
    print(i)
    # NOTE: `mod` is the option's display text and is inserted into the query
    # string verbatim; this URL construction is the subject of the question.
    link = f"https://predictioncenter.org/casp14/results.cgi?view=tables&target={target}&model={mod}&groups_id="
    browser.get(link)
You can select '- All -' from the model dropdown using Selenium: click on the dropdown, then select the desired value with the select_by_index()
method, and it should work as expected.
Full working code:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from io import StringIO

# Open the results page for target T1024, switch the "model" dropdown to its
# first entry, and load the resulting results table into a DataFrame.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
URL = 'https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=1&groups_id='

try:
    driver.get(URL)
    driver.maximize_window()
    time.sleep(5)

    # Wait until the table cell is clickable and click it before touching the dropdown.
    WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '.table > tbody tr:first-child > td:nth-child(3)')
        )
    ).click()
    time.sleep(2)

    dropdown = Select(driver.find_element(By.CSS_SELECTOR, 'select#model'))
    time.sleep(2)

    # Index 0 is the first entry of the model dropdown.
    dropdown.select_by_index(0)
    # Fixed pause before reading the page source
    # (presumably to let the table refresh after the selection -- TODO confirm).
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always release the browser, even when one of the steps above raises.
    driver.quit() # close browser

table = soup.select_one('.table_results')
if table is None:
    # Without this check, str(None) would be handed to read_html and fail obscurely.
    raise RuntimeError("No element with class 'table_results' found in the page source")

# Wrap the HTML in StringIO: passing literal HTML strings to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]
print(df)
Output:
0 1 2 3 4 ... 24 25 26 27 28
0 General General General General General ... Handed. TM TM FlexE ASE
1 # Model GR# GR Name Charts ... Handed. TMscore TMalign FlexE ASE
2 # NaN NaN NaN NaN ... NaN NaN NaN NaN NaN
3 NaN Model NaN NaN NaN ... NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN
.. ... ... ... ... ... ... ... ... ... ... ...
592 508. T1024TS170_4 170 s BhageerathH-Plus A D I G ... 0.53 0.23 0.42 201.20 NaN
593 509. T1024TS063_5 063 s ACOMPMOD A D I G ... 0.50 0.17 0.27 746.50 92.81
594 510. T1024TS305_1 305 s CAO-SERVER A D I G ... 0.48 0.22 0.30 151.85 18.11
595 511. T1024TS342_2 342 CUTSP A D I G ... 0.49 0.15 0.28 410.17 NaN
596 512. T1024TS217_5 217 CAO-QA1 A D I G ... 0.50 0.20 0.32 167.41 17.24
[597 rows x 29 columns]