Search code examples
pythonpandasweb-scrapingselenium-chromedriverwebdriver

Empty pdf while scraping using Python and Selenium


I'm facing an issue while attempting to print a PDF from a webpage using Selenium in Python. The webpage in question is https://jamabandi.nic.in/land%20records/NakalRecord. I'm trying to select the first record from each drop-down and then click on the "Nakal" button to generate a PDF.

However, the resulting PDF is always empty, even though there is a table present on the webpage. I've tried both the manual print-to-PDF operation and automated printing using Selenium, but in both cases, the generated PDF is empty.

import json
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

# Set up preferences for printing to PDF: pre-select the "Save as PDF"
# destination in Chrome's print preview.
settings = {
    "recentDestinations": [{"id": "Save as PDF", "origin": "local", "account": ""}],
    "selectedDestinationId": "Save as PDF",
    "version": 2
}
prefs = {
    'printing.print_preview_sticky_settings.appState': json.dumps(settings),
    'printing.print_to_file': True,
    'printing.print_to_file.path': '/Users/jatin/Downloads/output.pdf'  # Specify the desired output path
}

# Build ONE options object and hand that same object to the driver, so the
# kiosk-printing switch and the print preferences actually take effect.
chrome_options = Options()
# chrome_options.add_argument('--headless')  # Optional: Run Chrome in headless mode
chrome_options.add_argument('--kiosk-printing')  # print without waiting on the print dialog
chrome_options.add_experimental_option('prefs', prefs)

try:
    service = Service(ChromeDriverManager().install())
except ValueError:
    # Fall back to explicitly requesting the latest published chromedriver.
    latest_chromedriver_version_url = "https://chromedriver.storage.googleapis.com/LATEST_RELEASE"
    latest_chromedriver_version = urllib.request.urlopen(latest_chromedriver_version_url).read().decode('utf-8')
    service = Service(ChromeDriverManager(version=latest_chromedriver_version).install())

url = 'https://jamabandi.nic.in/land%20records/NakalRecord'
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get(url)

# Pick the first real entry (index 1) from each drop-down, in page order.
dropdown_district = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddldname"]'))
dropdown_district.select_by_index(1)
# Select the tehsil dropdown element and choose the first option,we will loop here for multiple anchals
drop_down_tehsil = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddltname"]'))
drop_down_tehsil.select_by_index(1)
drop_down_vill = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddlvname"]'))
drop_down_vill.select_by_index(1)
drop_down_year = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddlPeriod"]'))
drop_down_year.select_by_index(1)
drop_down_owner = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ddlOwner"]'))
drop_down_owner.select_by_index(1)
owner_names = Select(driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_ListBox1"]'))
owner_names.select_by_index(2)

# Parse the rendered page and collect the href of the first anchor in every
# table cell of the 'col-lg-12' container.
page_source = BeautifulSoup(driver.page_source, 'html.parser')
div_col_lg_12 = page_source.find('div', class_='col-lg-12')
if div_col_lg_12 is None:
    raise RuntimeError("No <div class='col-lg-12'> found in the page source")

nakal_hrefs = []
for cell in div_col_lg_12.find_all('td'):
    anchor = cell.find('a')
    if anchor is not None:
        nakal_hrefs.append(anchor['href'])
if not nakal_hrefs:
    raise RuntimeError("No links found inside the 'col-lg-12' table cells")

# The last link found is the one that gets executed.
# NOTE(review): the href is run as JavaScript as-is, so it is presumably a
# `javascript:` link - confirm against the page.
new_link = nakal_hrefs[-1]
javascript_code = str(new_link)

# Execute the JavaScript code
driver.execute_script(javascript_code)

# Switch to the most recently opened window before printing.
window_handles = driver.window_handles
driver.switch_to.window(window_handles[-1])

# Open the print dialog using JavaScript
driver.execute_script('window.print();')

enter image description here

enter image description here


Solution

  • The generated PDF is empty because in the document, you have:

    <style>
        @media print
        {
            html,body
            {
                display:none;
            }
        }
    </style>
    

    So the document content is hidden when the page is printed. What you need to do is remove the <style> element before printing:

    # Remove the first <style> element in the document so its rules no longer
    # apply (on this page, presumably the `@media print` rule that hides
    # html/body - confirm it is the first <style> in the document).
    driver.execute_script("document.querySelector('style').remove()")
    # Trigger the browser's print action now that the content is visible in print.
    driver.execute_script("window.print()")
    

    enter image description here

    Edit:

    To extract data directly from page source, you can use pd.read_html:

    import io

    import pandas as pd

    # Wrap the HTML in a file-like object before handing it to pandas, then
    # keep only the table whose id is 'GridView1' (the first match).
    page_source = io.StringIO(driver.page_source)
    df = pd.read_html(page_source, attrs={'id': 'GridView1'})[0]
    

    Output:

    >>> df
        खेवट या जमाबंदी न.  खतौनी न.  ... माल और सवाई के ब्यौरे सहित मांग            अभियुक्ति
    0                    7      10.0  ...                             NaN        बरूऎ रपट न. 1
    1                   //       NaN  ...                             NaN  तिथी 26-09-2012 रजि
    2                    7       NaN  ...                             NaN          न.4173 तिथी
    3                  NaN       NaN  ...                             NaN   24/09/2012 4:27:00
    4                  NaN       NaN  ...                             NaN     PM के अनुसार मिन
    ..                 ...       ...  ...                             ...                  ...
    124                NaN       NaN  ...                             NaN                  NaN
    125                NaN       NaN  ...                             NaN                  NaN
    126                NaN       NaN  ...                             NaN                  NaN
    127                NaN       NaN  ...                             NaN                  NaN
    128                NaN       NaN  ...                             NaN                  NaN
    
    [129 rows x 12 columns]