Search code examples

Scraping/Crawling a website with multiple tabs using python

I am seeking assistance in extracting data from a website with multiple tabs and saving it in a .csv format using Python and Selenium. The website in question is:

There are five different tabs on the page, but my focus is on extracting data from the first three tabs.

1st Tab: First tab has 2 different options

2nd Tab: Second tab has 5 different options

3rd Tab: Third Tab has multiple different options

Additionally, there are two more tabs, one representing "ALL" and the other representing the "date." I need to retrieve data for all combinations of the first three tabs while keeping the "ALL" tab selected and the date set to the current date.

I was attempting to perform this operation using Selenium, but due to my limited experience with the tool, I was unable to achieve the desired outcome. Therefore, I am seeking guidance on how to proceed.

import random
import time
from io import StringIO
from itertools import product

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_element(driver, by, value, timeout=10):
    """Block until an element matching the locator is present, then return it.

    Args:
        driver: The WebDriver instance to poll.
        by: Locator strategy (for example ``By.ID``).
        value: Locator value that goes with ``by``.
        timeout: Timeout handed to ``WebDriverWait`` (10 by default).

    Returns:
        Whatever ``WebDriverWait.until`` returns for the presence condition.
    """
    locator = (by, value)
    waiter = WebDriverWait(driver, timeout)
    condition = EC.presence_of_element_located(locator)
    return waiter.until(condition)

def scrape_and_save(driver, end_type, equity_type, cap_type, all_type, filename):
    """Select one dropdown combination, load the fund table and save it as CSV.

    Args:
        driver: WebDriver that already has the target page open.
        end_type: Option value to select in the ``end-type`` dropdown.
        equity_type: Option value to select in the ``equity-type`` dropdown.
        cap_type: Option value to select in the ``cap-type`` dropdown.
        all_type: Option value to select in the ``all-type`` dropdown.
        filename: Path of the CSV file to write.

    Any exception raised while waiting for, selecting or parsing an element
    propagates to the caller.
    """
    # Dropdown element id -> option value, applied in page order.
    selections = (
        ("end-type", end_type),
        ("equity-type", equity_type),
        ("cap-type", cap_type),
        ("all-type", all_type),
    )
    for element_id, option_value in selections:
        dropdown = Select(wait_for_element(driver, By.ID, element_id))
        dropdown.select_by_value(option_value)
        # Pause 1-2 seconds after each selection before touching the next one.
        time.sleep(random.uniform(1, 2))

    # Click "Go" button
    wait_for_element(driver, By.ID, "go-button").click()

    # Wait for table to load
    # NOTE(review): this only checks that an element with this id is present,
    # so it may match the table left over from the previous combination --
    # confirm whether the page replaces the table element on each search.
    table = wait_for_element(driver, By.ID, "fund-table", timeout=15)

    # Passing a literal HTML string to read_html is deprecated in recent
    # pandas releases; a file-like wrapper is the supported form.
    table_html = table.get_attribute("outerHTML")
    df = pd.read_html(StringIO(table_html))[0]

    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"Saved data to {filename}")

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Make sure you have chromedriver installed and in PATH
try:
    driver.get("")  # Replace with actual URL

    # Wait for initial page load
    wait_for_element(driver, By.ID, "end-type", timeout=30)
    print("Page loaded successfully")

    # Define options for each dropdown
    end_types = ["1", "2"]  # Open-ended, Closed-end
    equity_types = ["1", "2", "3", "4", "5", "6"]  # Replace with actual values
    cap_types = ["1", "2", "3", "4"]  # Replace with actual values
    all_types = ["1", "2", "3", "4"]  # Replace with actual values

    # Iterate through every combination of the four dropdowns.
    for end, equity, cap, all_type in product(
        end_types, equity_types, cap_types, all_types
    ):
        filename = f"fund_data_{end}_{equity}_{cap}_{all_type}.csv"
        try:
            scrape_and_save(driver, end, equity, cap, all_type, filename)
            time.sleep(random.uniform(3, 5))  # Random wait between 3 to 5 seconds
        except Exception as e:
            # Best effort: report the failed combination and keep going.
            print(f"Error scraping combination {end}_{equity}_{cap}_{all_type}: {str(e)}")
finally:
    # Always close the browser, even if the page never loads.
    driver.quit()



  • Your target page loads the table from another page through an iframe. In this case the data can be extracted directly from the iframe page with bs4, which is faster than Selenium here. Here is sample code using bs4:


    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    def fetchValue(primary_category, category, file, nav_date='25-Oct-2024'):
        """Download one category's fund table and write it to a CSV file.

        Args:
            primary_category: Value placed at the start of the request URL.
            category: Value sent as the ``category`` query parameter.
            file: Path of the CSV file to write.
            nav_date: Value sent as the ``nav-date`` query parameter; defaults
                to the date that was previously hard-coded.
        """
        columns = [
            "Scheme",
            "Benchmark",
            "Riskometer_Scheme",
            "Riskometer_Benchmark",
            "Lates_Nav_Regular",
            "Lates_Nav_Direct",
            "Five_Year_Retrun_Regular",
            "Five_Year_Retrun_Direct",
            "Five_Year_Retrun_Benchmark",
            "Daily_AUM",
        ]
        # NOTE(review): the scheme/host part of this URL is missing here;
        # restore it before running.
        url = f'{primary_category}&category={category}&amc=ALL&nav-date={nav_date}'
        headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0","Referer":""}
        # A timeout keeps an unresponsive server from hanging the script.
        resp = requests.get(url, headers=headers, timeout=30).text
        soup = BeautifulSoup(resp, 'lxml')

        rows = []
        for row in soup.find_all('tr'):
            cells = row.find_all('td')
            # Skip rows without enough <td> cells instead of swallowing
            # an exception for them.
            if len(cells) < len(columns):
                continue
            # Assumes the first ten <td> cells appear in the same order as
            # `columns` -- TODO confirm against the live table.
            rows.append(
                [cell.get_text(strip=True) for cell in cells[:len(columns)]]
            )

        df = pd.DataFrame(rows, columns=columns)
        df.to_csv(file, index=False)
    # NOTE(review): the page URL and Referer are empty here; fill them in
    # before running.
    url = ""
    # A timeout keeps an unresponsive server from hanging the script.
    resp = requests.get(url, headers={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0","Referer":""}, timeout=30).text
    soup = BeautifulSoup(resp, 'lxml')
    category_list = soup.find('select', id='category')
    if category_list is None:
        raise RuntimeError("could not find <select id='category'> on the page")
    # has 40 category combinations on table 2-3; slicing (rather than indexing
    # range(40)) avoids an IndexError when the page offers fewer options, and
    # the option list is looked up only once.
    for option in category_list.find_all('option')[:40]:
        category = option['value']
        primary_category = category.split('_')[0]
        fetchValue(primary_category, category, f'{category}.csv')

    I kept the code as basic as possible so that it is easy to understand.