Search code examples
python selenium selenium-webdriver xpath attributes

With python and selenium, how to find the hidden links of files on a website?


Using Python 3 and Selenium, I want to capture the links to the PDF files on one page. I couldn't find these links in Inspect Element; it seems that they are generated dynamically.

So on the site I looked for their exact location: the "Documentos" links box. It contains a list of links (Certidão), and clicking one opens a new tab with the PDF - example

I then wrote the script below, which uses an XPath to find the elements in the PDF links box and then calls a function that should read the relevant attribute of each link.

But it's not working. Does anyone know how I could fix this, or of another method that would work?

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select


# Page to scrape: its "Documentos" box lists the PDF links to capture.
site = "https://divulgacandcontas.tse.jus.br/divulga/#/candidato/2022/2040602022/AP/30001653385"


def find(elem):
    """Return elem's "dvg-link-doc dvg-certidao" attribute value, or False.

    Written to be passed to WebDriverWait.until(), which keeps polling
    while the callable returns a falsy value: a missing or empty attribute
    therefore yields False and any non-empty value is returned as-is.

    NOTE(review): "dvg-link-doc dvg-certidao" looks like a pair of CSS
    class names rather than an attribute name - confirm against the page.
    """
    return elem.get_attribute("dvg-link-doc dvg-certidao") or False

from selenium.common.exceptions import WebDriverException

# Raw string: the backslashes of the Windows path must not be read as escape
# sequences (the resulting path is unchanged).
# NOTE(review): recent Selenium 4 releases no longer accept the driver path
# positionally; pass Service(path) via `service=` there - confirm the
# installed version.
driver = webdriver.Chrome(r'D:\Code\chromedriver.exe')
driver.get(site)


# Collected results, one {"link": <url>} dict per PDF link found.
documents = []
# Look for the elements in the box where the PDFs are
elems = driver.find_elements("xpath", '/html/body/div[2]/div[1]/div/div[1]/section[3]/div/div[3]/div[2]/div/div/ul')


# Iterate over the elements found
for elem in elems:
    # Test if there is a link available. Only the wait itself is guarded:
    # until() raises TimeoutException (a WebDriverException) when find()
    # never returns a truthy value within 2 seconds. Catching only Selenium
    # errors keeps the best-effort skip without hiding programming errors
    # such as a misspelled variable name.
    try:
        links = WebDriverWait(elem, 2).until(find)
    except WebDriverException:
        continue

    print(links)

    if links.endswith(".pdf"):
        print(links)
        documents.append({"link": links})

Solution

  • This is one way of obtaining the URLs of the PDF files under 'Documentos' (brown links):

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import time as t
    
    
    # Browser configuration.
    chrome_options = Options()
    for chrome_flag in ("--no-sandbox", 'disable-notifications', "window-size=1280,720"):
        chrome_options.add_argument(chrome_flag)

    # Path to where you saved the chromedriver binary.
    webdriver_service = Service("chromedriver/chromedriver")
    browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

    url = "https://divulgacandcontas.tse.jus.br/divulga/#/candidato/2022/2040602022/AP/30001653385"

    # Wait condition for the document links: elements carrying both the
    # dvg-link-doc and dvg-certidao CSS classes. Built once and reused for
    # every (re)lookup below.
    certidao_links_present = EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, ".dvg-link-doc.dvg-certidao")
    )

    browser.get(url)
    links = WebDriverWait(browser, 20).until(certidao_links_present)

    # The number of links is fixed by the first lookup; `counter` is the index
    # of the link handled in the current pass.
    total_links = len(links)
    counter = 0
    while counter < total_links:
        current_link = links[counter]
        print(current_link.text)
        t.sleep(1)
        current_link.click()
        t.sleep(1)
        # Switch to the last window handle - presumably the tab the click
        # just opened - so that current_url is the PDF address.
        browser.switch_to.window(browser.window_handles[-1])
        print(browser.current_url)
        t.sleep(1)
        # Load the page again in the now-focused window and look the links up
        # afresh: references obtained before the navigation are not reused.
        browser.get(url)
        counter += 1
        links = WebDriverWait(browser, 20).until(certidao_links_present)
        t.sleep(1)
    

    This will print out in terminal:

    Certidão criminal da Justiça Federal de 2º grau
    https://divulgacandcontas.tse.jus.br/candidaturas/oficial/2022/BR/AP/546/candidatos/897646/12_1659631723977.pdf
    Certidão criminal da Justiça Federal de 1º grau
    https://divulgacandcontas.tse.jus.br/candidaturas/oficial/2022/BR/AP/546/candidatos/897646/11_1659631722277.pdf
    Certidão criminal da Justiça Estadual de 2º grau
    https://divulgacandcontas.tse.jus.br/candidaturas/oficial/2022/BR/AP/546/candidatos/897646/14_1659631720538.pdf
    Certidão criminal da Justiça Estadual de 1º grau
    https://divulgacandcontas.tse.jus.br/candidaturas/oficial/2022/BR/AP/546/candidatos/897646/13_1659631719616.pdf
    

    You need to adapt the code to your own selenium setup, just observe the imports and the code after defining the browser/driver. Selenium docs: https://www.selenium.dev/documentation/