
BeautifulSoup cannot find every link on the page


Here is my code:

from bs4 import BeautifulSoup
import requests
import os

def file_download():
    page = requests.get("https://ec.europa.eu/eurostat/web/main/data/database")

    html = page.text
    soup = BeautifulSoup(html, "html.parser")

    for link in soup.find_all('a'):
        url = link.get('href')
        print(url)
        # Skip anchors without an href so the membership test cannot fail
        if url and ".gz" in url:
            file_name = url.split("file=", 1)[1]
            if os.path.exists(file_name):
                print("File already exists.")
                continue
            with open(file_name, 'wb') as file:
                print('Downloading...')
                response = requests.get(url)
                file.write(response.content)

    print('\nEvery file has been downloaded!')

With the above code I cannot seem to find every possible link on the page. In Chrome's inspector, the copied element shows exactly the kind of .gz download link I am after, yet it never appears in what BeautifulSoup parses. That is what I want to find, along with other similar links.
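A quick sanity check of what the static HTML actually contains helps narrow this down. The sketch below reuses the URL from the code above; if the .gz count comes back as zero, the download links are presumably injected by JavaScript after the initial page load, so they never appear in the HTML that requests receives:

from bs4 import BeautifulSoup
import requests

page = requests.get("https://ec.europa.eu/eurostat/web/main/data/database")
soup = BeautifulSoup(page.text, "html.parser")

# Collect every href on the page and count how many point at .gz files
hrefs = [a.get('href') for a in soup.find_all('a') if a.get('href')]
gz_links = [h for h in hrefs if '.gz' in h]
print(len(hrefs), "hrefs found,", len(gz_links), "contain '.gz'")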


Solution

  • It is probably best to avoid accessing the files via the tree structure (as it would require a lot of JSON interactions).

    An easier approach is to use their bulk download listing, which enumerates every available file:

    from bs4 import BeautifulSoup
    import requests
    import os

    session = requests.Session()
    req_all = session.get("https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?dir=data&sort=1&sort=2&start=all")
    soup = BeautifulSoup(req_all.content, "lxml")
    table = soup.find('table', id='filetable')

    for a in table.find_all('a', href=True):
        if a.text == "Download":
            href = a['href']

            if '.gz' in href:
                # The path separators in the href are URL-encoded,
                # so split on '%2F' to recover the bare file name
                filename = href.rsplit('%2F', 1)[1]

                if not os.path.exists(filename):
                    with open(filename, 'wb') as f_gz:
                        # Reuse the session for the download as well
                        f_gz.write(session.get(href).content)
                        print(filename)
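
    Reading response.content pulls each archive fully into memory before writing it out. For the larger Eurostat datasets it may be worth streaming the download in chunks instead; the sketch below assumes the same session and href values as above, and the helper name download_file and the chunk size are illustrative choices:

    import os
    import requests

    def download_file(session, href, filename, chunk_size=1 << 16):
        # Stream the response body so large .gz archives are written
        # to disk in chunks instead of being held in memory at once
        with session.get(href, stream=True) as response:
            response.raise_for_status()
            with open(filename, 'wb') as f_gz:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f_gz.write(chunk)

    Inside the loop above, the f_gz.write(...) line would then become a call to download_file(session, href, filename).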