Search code examples
web-scraping, select, beautifulsoup

Issue when trying to select an option from a list for scraping - Python


I am trying to scrape the table contained in the following page: https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=1&groups_id=

At the top of the table, I want to change the model from "1" to "- All -". I wrote the following lines of code:

# Simplified reproduction: request the results table with the model parameter
# set to the dropdown's visible label "- All -".
# NOTE(review): the label is inserted verbatim (literal spaces, no
# percent-encoding); whether the server accepts this label as the `model`
# value is not shown here -- confirm against the <option> value attribute.
link = f"https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=- All -&groups_id="
browser.get(link)

but this isn't working.

When I replace model=- All - with model=1, the code works, so I suspect the problem is with my - All - option, but I can't figure out what it is.

Full code below with the loop through all Targets and Model options (the version above was simplified):

from bs4 import BeautifulSoup,NavigableString, Tag 
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import csv
import os
import numpy as np

# Script purpose: open predictioncenter.org in Chrome, click through to the
# results page for target T1024, read the available "target" and "model"
# dropdown entries, then request the results URL for every target.

# Switch the working directory to the (placeholder) chromedriver location.
os.chdir('THE DIRECTORY WHERE YOUR CHROMEDRIVER IS')
# Build Chrome options: headless mode, with the 'enable-logging' switch excluded.
# NOTE(review): this `options` object is not passed to webdriver.Chrome(...)
# below, and the name `options` is rebound to a list of <option> tags later in
# the script, so these settings do not appear to take effect -- confirm.
options = webdriver.ChromeOptions()
options.add_argument("headless")
options.add_experimental_option('excludeSwitches', ['enable-logging'])


# Start Chrome using the (placeholder) chromedriver path.
browser = webdriver.Chrome(executable_path='THE DIRECTORY WHERE YOUR CHROMEDRIVER IS/chromedriver')
browser.get("https://predictioncenter.org/") # open the site's landing page in the browser

# NOTE(review): `df` is not used again in the visible part of the script.
df = pd.DataFrame()

# Navigate by clicking three links in order; each lookup takes the first match.
# `x` only receives the return value of click() and is not read afterwards.
x = browser.find_elements(By.XPATH, "//a[contains(@id, 'ygtvlabelel6')]")[0].click()
x = browser.find_elements(By.XPATH, "//a[contains(@href, 'results.cgi')]")[0].click()
x = browser.find_elements(By.XPATH, "//a[contains(@id, 'a_T1024')]")[0].click()   

# Parse the HTML of the page the browser is currently showing.
content = browser.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
    
# Collect the visible text of every <option> in the "target" dropdown.
# (This rebinds `options`, replacing the ChromeOptions object created above.)
options = soup.find("select",{"name":"target"}).findAll("option")
list_prot = []
for i in options:
    name = i.text
    list_prot.append(name)

# Collect the visible text of every <option> in the "model" dropdown.
type_model = soup.find("select",{"name":"model"}).findAll("option")    
model_t=[]
for i in type_model:
    name = i.text
    model_t.append(name)

# Use the first model entry's visible text for all requests.
# NOTE(review): presumably this is the "- All -" label described in the
# question -- confirm against the page.
mod=model_t[0]

# NOTE(review): `i` is never incremented in the visible loop, so print(i)
# outputs 0 on every iteration; `final` is not used in the visible code.
i=0
final=pd.DataFrame()
for target in list_prot:
    print(i)
    # The URL is built from the options' visible text (`.text`), not their
    # `value` attributes, and the text is inserted without percent-encoding.
    link = f"https://predictioncenter.org/casp14/results.cgi?view=tables&target={target}&model={mod}&groups_id="
    browser.get(link)

Solution

  • You can select '- All -' from the model dropdown using Selenium: it requires clicking on the dropdown and then selecting the desired option with the select_by_index() method, after which it should work as expected.

    Full Working code:

    import pandas as pd
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import time
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import Select
    
    
    # Scrape the CASP14 results table for target T1024 after switching the
    # "model" dropdown to its first entry, then print it as a DataFrame.

    # pandas >= 2.1 deprecates passing literal HTML to read_html; wrap it in
    # a file-like object instead.
    from io import StringIO

    URL = 'https://predictioncenter.org/casp14/results.cgi?view=tables&target=T1024&model=1&groups_id='

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(URL)
        driver.maximize_window()
        time.sleep(5)  # fixed pause to let the page finish loading

        # Wait (up to 30 s) for the third cell of the first table row to be
        # clickable, then click it before touching the dropdown.
        WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.table > tbody tr:first-child > td:nth-child(3)'))).click()
        time.sleep(2)

        # Choose the first entry of the <select id="model"> dropdown; per the
        # accompanying explanation this is the '- All -' option.
        dropdown = Select(driver.find_element(By.CSS_SELECTOR, 'select#model'))
        time.sleep(2)
        dropdown.select_by_index(0)
        time.sleep(2)  # fixed pause to let the page update after the selection

        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.select_one('.table_results')
        if table is None:
            # Fail with a clear message instead of handing "None" to pandas.
            raise RuntimeError("No element matching '.table_results' was found on the page")

        df = pd.read_html(StringIO(str(table)))[0]
        print(df)
    finally:
        # Always close the browser, even if a step above raised.
        driver.quit()
    

    Output:

        0             1        2                 3           4   ...       24       25       26      27     28
    0    General       General  General           General     General  ...  Handed.       TM       TM   FlexE    ASE
    1          #         Model      GR#           GR Name      Charts  ...  Handed.  TMscore  TMalign   FlexE    ASE
    2          #           NaN      NaN               NaN         NaN  ...      NaN      NaN      NaN     NaN    NaN
    3        NaN         Model      NaN               NaN         NaN  ...      NaN      NaN      NaN     NaN    NaN
    4        NaN           NaN      NaN               NaN         NaN  ...      NaN      NaN      NaN     NaN    NaN
    ..       ...           ...      ...               ...         ...  ...      ...      ...      ...     ...    ...
    592     508.  T1024TS170_4    170 s  BhageerathH-Plus  A  D  I  G  ...     0.53     0.23     0.42  201.20    NaN
    593     509.  T1024TS063_5    063 s          ACOMPMOD  A  D  I  G  ...     0.50     0.17     0.27  746.50  92.81
    594     510.  T1024TS305_1    305 s        CAO-SERVER  A  D  I  G  ...     0.48     0.22     0.30  151.85  18.11
    595     511.  T1024TS342_2      342             CUTSP  A  D  I  G  ...     0.49     0.15     0.28  410.17    NaN
    596     512.  T1024TS217_5      217           CAO-QA1  A  D  I  G  ...     0.50     0.20     0.32  167.41  17.24
    
    [597 rows x 29 columns]