I am trying to scrape a table from a JavaScript website using selenium in python. The process is to submit a form using selenium driver through python and then retrieving the loaded page. As the website is slow, sometimes the table which is loaded for the previous selection remains in the page and the code scrapes the wrong data instead of waiting for the new table to load. I want to ensure that the table I scrape corresponds with the selection I made from the dropdown. As the page may take anywhere between 5 seconds to several minutes to load, specifying waiting time may not work. So I put a second while
loop within the first while
loop to verify that the names in the dropdown and the resulting table are same before scraping the data. But it seems not to be working and the data is being scraped without verifying the names first. I am giving a sample below. Please tell me how to get it correct.
import pandas as pd
import io
import time
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
from selenium.webdriver.support.ui import Select
from retry import retry
# Web page url
driver.get("http://mnregaweb4.nic.in/netnrega/dynamic_work_details.aspx?page=S&lflag=eng&state_name=KERALA&state_code=16&fin_year=2020-2021&source=national&Digest=s5wXOIOkT98cNVkcwF6NQA")
x=0
while True:
try:
driver.implicitly_wait(5)
# Find District of option
x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
drop = Select(x)
# Select by value
drop.select_by_value("1613")
time.sleep(6)
# Find Block of option
x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
drop = Select(x)
# Select by value
drop.select_by_value("1613001")
time.sleep(4)
# Find GP of option
x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
drop = Select(x)
# Select by value
drop.select_by_value("1613001001")
time.sleep(4)
search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
search_button.click()
time.sleep(8)
soup = BeautifulSoup(driver.page_source, 'lxml')
tables = soup.find_all('table')
dfs = pd.read_html(str(tables))
#print(dfs[4])
df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
c = df1.iat[1,3]
print(c)
c == "Alayamon"
while True:
try:
df1.to_csv("break.csv", index=False)
break
except:
if x:
print("waiting...")
x = False
driver.close()
break
except:
if x:
print("Error")
x = False
Hi hope this helps just slot it in before your webpage loads
driver = chromedriver()
driver.get("http://mnregaweb4.nic.in/netnrega/dynamic_work_details.aspx?page=S&lflag=eng&state_name=KERALA&state_code=16&fin_year=2020-2021&source=national&Digest=s5wXOIOkT98cNVkcwF6NQA")
timeout = 3
try:
element_present = EC.presence_of_element_located((By.ID, 'main'))
WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
print("Timed out waiting for page to load")
finally:
print("Page loaded")
Documentation found on https://pythonbasics.org/selenium-wait-for-page-to-load/