Search code examples
python html selenium web-scraping screen-scraping

Scrape data that changes every time an "li" option is selected - Python Selenium


I'm trying to scrape data from this site: https://www.pais.co.il/info/Thank-to.aspx (ignore the Hebrew).

I need to click on any of these options from the first dropdown menu

enter image description here

click on that button

enter image description here

and scrape these numbers

enter image description here

I know how to scrape the numbers and how to click or select buttons, but I can't figure out how to iteratively select each option from that weird dropdown menu...

I tried to click on that button to open the dropdown menu, as some suggestions on the internet recommend, but I was unable to do so:

enter image description here

    button1 = driver.find_element_by_xpath('/html/body/form/div[3]/div[1]/div/div/div[1]/select')

but I get the error: Message: no such element: Unable to locate element

I would love your help, as I'm a newbie in the field of web scraping :)


Solution

  • The data you need is loaded with js so you can use Selenium to get the list of cities. Here is one possible solution:

    import csv
    import requests
    from typing import Union, Any
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    
    def get_data(url: str, city_name: str, timeout: float = 30) -> Union[dict[str, Any], str]:
        """Request the statistics for one city and label the returned values.

        Args:
            url: Endpoint that receives the form-encoded POST request.
            city_name: Value sent as the ``city`` form field.
            timeout: Seconds ``requests`` waits for the server. Without a
                timeout a stalled connection would block the script forever.

        Returns:
            A dict holding the city name and items 0-3 of the decoded JSON
            response, or the string ``'No data for <city_name>'`` when the
            request or the JSON decoding raises ``ValueError`` (for example
            when the response body is not JSON).

        Raises:
            requests.exceptions.Timeout: If the server does not answer
                within ``timeout`` seconds.
        """
        payload = {
            'city': city_name,
            # NOTE(review): presumably the default "choose a category" labels
            # of the two category dropdowns - confirm against the page.
            'mainCategory': 'בחר תחום',
            'secondCategory': 'בחר תת תחום',
        }
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
        # Keep only the call that can raise ValueError inside the try block.
        try:
            r = requests.post(url, data=payload, headers=headers, timeout=timeout).json()
        except ValueError:
            return f'No data for {city_name}'
        return {
            "City Name": city_name,
            "Ventures": r[0],
            "Realizable Investments": r[1],
            "Realized Investments": r[2],
            "Amount Invested Since 1989": r[3],
        }
    
    def save_to_csv(data: list) -> None:
        """Append the items of *data* as a single row to ``pais.csv``.

        The file is opened in append mode with UTF-8 encoding, so repeated
        calls add one line each.
        """
        row = list(data)
        out_file = open(file='pais.csv', mode='a', encoding="utf-8")
        with out_file:
            csv.writer(out_file, lineterminator='\n').writerow(row)
    
    
    main_url = 'https://www.pais.co.il/info/Thank-to.aspx'
    post_call_url = 'https://www.pais.co.il/grants/grantsRequestNumbers.ashx'
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    service = Service(executable_path="path/to/your/chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    try:
        wait = WebDriverWait(driver, 15)
        driver.get(main_url)
        # The city dropdown is looked up after switching into the page's iframe.
        wait.until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, "iframe")))
        cities = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#FacilitiesStats_ddlcity>option')))
        # Skip the first <option>; presumably it is the placeholder entry.
        city_names = [city.text for city in cities[1:]]
    finally:
        # The browser is only needed to read the city list. `quit` must be
        # *called*: a bare `driver.quit` is a no-op that leaves the browser
        # and chromedriver processes running.
        driver.quit()
    for name in city_names:
        data = get_data(post_call_url, name)
        if isinstance(data, dict):
            save_to_csv(data.values())
        else:
            # get_data returned its "No data for ..." message.
            print(data)
    
    

    For some cities there is no data, for example "בוסתאן אל-מרג", so we just print No data for בוסתאן אל-מרג to the console.

    Output csv file pais.csv:

    אבו גוש,19,6117232,14813422,20930654
    אבו סנאן,29,6517560,16225629,22743189
    אבן יהודה,28,3945008,13107701,17052709
    אום אל-פחם,76,56738614,200980004,257718618
    אופקים,109,21988456,130339851,152328307
    

    Tested on Python 3.9.10. Used Selenium 4.5.0 and requests 2.28.1

    Of course, we can get the required data using only Selenium, without the requests library. However, after testing, this solution turned out to be faster: a POST request returns the values we need immediately, whereas reading them with Selenium from the tag (div.counter) requires waiting for the counter animation to complete.

    You can also use, for example, ThreadPoolExecutor, which makes getting and saving the data much faster. Here is one possible solution:

    import csv
    import requests
    from itertools import repeat
    from typing import Union, Any
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from concurrent.futures import ThreadPoolExecutor
    
    
    def get_data(url: str, city_name: str, timeout: float = 30) -> Union[dict[str, Any], str]:
        """Request the statistics for one city and label the returned values.

        Args:
            url: Endpoint that receives the form-encoded POST request.
            city_name: Value sent as the ``city`` form field.
            timeout: Seconds ``requests`` waits for the server. Without a
                timeout a stalled connection would block its worker thread
                forever.

        Returns:
            A dict holding the city name and items 0-3 of the decoded JSON
            response, or the string ``'No data for <city_name>'`` when the
            request or the JSON decoding raises ``ValueError`` (for example
            when the response body is not JSON).

        Raises:
            requests.exceptions.Timeout: If the server does not answer
                within ``timeout`` seconds.
        """
        payload = {
            'city': city_name,
            # NOTE(review): presumably the default "choose a category" labels
            # of the two category dropdowns - confirm against the page.
            'mainCategory': 'בחר תחום',
            'secondCategory': 'בחר תת תחום',
        }
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
        # Keep only the call that can raise ValueError inside the try block.
        try:
            r = requests.post(url, data=payload, headers=headers, timeout=timeout).json()
        except ValueError:
            return f'No data for {city_name}'
        return {
            "City Name": city_name,
            "Ventures": r[0],
            "Realizable Investments": r[1],
            "Realized Investments": r[2],
            "Amount Invested Since 1989": r[3],
        }
    
    def save_to_csv(data: Union[dict, str]) -> None:
        """Append a dict's values as one row to ``pais.csv``; print anything else.

        A non-dict argument (the "No data for ..." message) is written to
        standard output and the file is left untouched.
        """
        if not isinstance(data, dict):
            print(data)
            return
        row = list(data.values())
        with open(file='pais.csv', mode='a', encoding="utf-8") as out_file:
            csv.writer(out_file, lineterminator='\n').writerow(row)
    
    
    main_url = 'https://www.pais.co.il/info/Thank-to.aspx'
    post_call_url = 'https://www.pais.co.il/grants/grantsRequestNumbers.ashx'
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    service = Service(executable_path="path/to/your/chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)
    try:
        wait = WebDriverWait(driver, 15)
        driver.get(main_url)
        # The city dropdown is looked up after switching into the page's iframe.
        wait.until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, "iframe")))
        cities = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#FacilitiesStats_ddlcity>option')))
        # Skip the first <option>; presumably it is the placeholder entry.
        city_names = [city.text for city in cities[1:]]
    finally:
        # The browser is only needed to read the city list. `quit` must be
        # *called*: a bare `driver.quit` is a no-op that leaves the browser
        # and chromedriver processes running.
        driver.quit()
    with ThreadPoolExecutor() as executor:
        # Requests run concurrently in the pool; rows are written here, in
        # input order, so an exception raised while saving is not lost in an
        # unconsumed map() iterator.
        for data in executor.map(get_data, repeat(post_call_url), city_names):
            save_to_csv(data)