I am trying to scrape the paginated table from "https://tariffs.ib-net.org/sites/IBNET/TariffTable#", where the URL stays the same for every page.
I have already referred to the pages below for help, but couldn't reach a solution:
https://stackoverflow.com/questions/73362475/using-selenium-to-scrape-paginated-table-data-python
It was interesting to learn from "Scraping dynamic DataTable of many pages but same URL" that we can extract the JSON directly, but I have no clue how to do that.
Any help will be greatly appreciated.
My trials:
Code 1:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
options = Options()
# options.add_argument('--headless')
# options.add_argument('start-maximized')
options.add_argument('disable-infobars')
driver = webdriver.Chrome(options=options)

url = 'https://tariffs.ib-net.org/sites/IBNET/TariffTable#'
driver.get(url)
time.sleep(10)
wait = WebDriverWait(driver, 10)

# Switch the page-length dropdown to 100 rows per page
page_size = driver.find_element(By.XPATH, '//*[@id="datatab_length"]/label/select')
Select(page_size).select_by_visible_text("100")
time.sleep(10)

table = wait.until(EC.presence_of_element_located((By.XPATH, "//table[@class='table table-striped table-hover table-bordered dataTable no-footer dtr-inline collapsed']")))
data_list = []
while True:
    # Extract data from the current page
    rows = table.find_elements(By.XPATH, "//table[@class='table table-striped table-hover table-bordered dataTable no-footer dtr-inline collapsed']//tbody")
    for row in rows:
        columns = row.find_elements(By.TAG_NAME, "tr")
        data_list.append([col.text.strip() for col in columns])
    print(data_list)
    next_button = driver.find_element(By.XPATH, "//*[@class='paginate_button next']/a")
    if next_button:
        # Click the next page button
        next_button.click()
        time.sleep(10)
        continue
    else:
        break
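For reference, here is a minimal sketch of a pagination loop that avoids two pitfalls in the attempt above. DataTables never removes the "next" control from the DOM; on the last page it just adds a "disabled" class, so the if next_button: test never reaches its else branch (the exact-class XPath either keeps matching or raises NoSuchElementException instead of returning something falsy). The sketch checks the class instead, and re-locates the rows on every iteration so stale references after a page change don't bite. The datatab table id is taken from Code 2 below; the disabled-class convention is standard DataTables behavior.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get("https://tariffs.ib-net.org/sites/IBNET/TariffTable#")
wait = WebDriverWait(driver, 20)

data_list = []
while True:
    # Wait for at least one data row, then re-locate the rows fresh each page
    wait.until(EC.presence_of_element_located((By.XPATH, "//table[@id='datatab']//tbody/tr")))
    for row in driver.find_elements(By.XPATH, "//table[@id='datatab']//tbody/tr"):
        cells = row.find_elements(By.TAG_NAME, "td")
        data_list.append([c.text.strip() for c in cells])

    # DataTables keeps the "next" control in the DOM and marks the last page
    # with a "disabled" class, so check the class rather than the presence.
    next_li = driver.find_element(By.XPATH, "//*[contains(@class,'paginate_button') and contains(@class,'next')]")
    if "disabled" in next_li.get_attribute("class"):
        break
    next_li.find_element(By.TAG_NAME, "a").click()
    time.sleep(2)  # crude; waiting for the old first row to go stale would be more robust

driver.quit()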
Code 2:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('disable-gpu')
browser = webdriver.Chrome(options=options)  # pass the options, otherwise they are ignored
browser.maximize_window()
browser.get("https://tariffs.ib-net.org/sites/IBNET/TariffTable#")
time.sleep(5)
table_header = browser.find_elements(By.XPATH, "//table[@id='datatab']/thead")
header_row = []
for header in table_header:
    header_row.append(header.text)
# print(header_row)

utility = []
city = []
service = []
date = []
fifteenm3 = []
fiftym3 = []
hundredm3 = []
while True:
    # Extract the rows on the current page
    all_rows = browser.find_elements(By.XPATH, "//div[@class='row']//tbody")
    for index in range(len(all_rows)):
        all_columns = all_rows[index].find_elements(By.XPATH, "//*[@role='row']")
        utility.append(all_columns[0].text)
        print(utility)
        city.append(all_columns[1].text)
        print(city)
        service.append(all_columns[2].text)
        print(service)
        date.append(all_columns[3].text)
        print(date)
        fifteenm3.append(all_columns[4].text)
        print(fifteenm3)
        fiftym3.append(all_columns[5].text)
        print(fiftym3)
        hundredm3.append(all_columns[6].text)
        print(hundredm3)
    # Move to the next page if a "next" button is found
    if browser.find_element(By.XPATH, "//*[@class='paginate_button next']/a"):
        browser.find_element(By.XPATH, "//*[@class='paginate_button next']/a").click()
        time.sleep(5)
        continue
    else:
        break
df = pd.DataFrame()
df['Utility'] = utility
df['service'] = service
df['city'] = city
df['date'] = date
df['15m3'] = fifteenm3
df['50m3'] = fiftym3
df['100m3'] = hundredm3
df.to_csv('data.csv')
The codes above either run for a few pages and then time out (even with sleeps added in between), or keep looping over the same page after running properly for a while.
The answer mentioned on "Scraping dynamic DataTable of many pages but same URL" is applicable here. On a high level, to get the accurate URL and values, here is what you want to do:

1. Open your browser's dev tools (F12) and switch to the Network tab.
2. Load the page and click through a couple of table pages, watching for the XHR/fetch request that returns the table data as JSON.
3. Copy that request (e.g. "Copy as cURL"), replay it from Python, and change its paging parameters to walk through all pages.
Here is the Python code for the request from my browser: https://pym.dev/p/2ueeq/. You can simplify this, because a lot of that information does not need to be sent.
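To give you an idea of what the simplified version tends to look like, here is a minimal sketch using the standard DataTables server-side parameters (draw, start, length) and response fields (data, recordsTotal/recordsFiltered). The endpoint path below is a placeholder, and whether the site expects GET or POST is an assumption: copy the real URL, method, and parameter names from the request you captured in dev tools.

import requests

# Hypothetical endpoint: replace with the URL captured in the Network tab
ENDPOINT = "https://tariffs.ib-net.org/sites/IBNET/TariffTable/GetData"

session = requests.Session()
all_rows = []
start, page_size = 0, 100

while True:
    # Standard DataTables server-side paging parameters (assumed; verify in dev tools)
    payload = {"draw": 1, "start": start, "length": page_size}
    resp = session.post(ENDPOINT, data=payload, timeout=30)
    resp.raise_for_status()
    body = resp.json()

    rows = body.get("data", [])
    if not rows:
        break
    all_rows.extend(rows)

    start += page_size
    total = body.get("recordsFiltered") or body.get("recordsTotal") or 0
    if total and start >= total:
        break

print(f"Fetched {len(all_rows)} rows")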
I am not familiar with your level of expertise with dev tools, so if you are stuck at any of the steps here, let's discuss this in the comments under this answer.