Search code examples
python web-scraping beautifulsoup python-requests

How to scrape images and description metadata from nested divs?


I am trying to extract images and description metadata from the European Space Agency image gallery website:

https://www.esa.int/ESA_Multimedia/Sets/Earth_from_Space_image_collection/(result_type)/images

The high res images I am trying to extract, along with their descriptions and image credits, are only accessible by clicking on the postage stamp images and navigating to the download button.

I have tried to extract the images using beautifulsoup4 and requests in Python, but I can only seem to grab the postage stamp images on a single page of the gallery. Everything else appears to be spread out over multiple pages, obfuscated, or deeply nested.

Any ideas?

Here is my code:

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Script: fetch the gallery page, follow every thumbnail link, and save the
# image found on each linked page into the ESA_Full_Images directory.

base_url = "https://www.esa.int"
main_url = "https://www.esa.int/ESA_Multimedia/Sets/Earth_from_Space_image_collection/(result_type)/images"
response = requests.get(main_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Get all the thumbnail links.
# urljoin resolves relative hrefs against base_url and leaves absolute hrefs
# untouched (plain concatenation would glue two hosts together).
# href=True skips anchors that carry no href instead of raising KeyError.
thumb_links = [
    urljoin(base_url, a['href'])
    for a in soup.find_all('a', class_='fancybox', href=True)
]

# exist_ok=True replaces the separate exists() check and cannot fail if the
# directory appears between the check and the creation.
os.makedirs('ESA_Full_Images', exist_ok=True)

# Iterate through each thumbnail link to find the full image URL
for idx, link in enumerate(thumb_links):
    img_page_response = requests.get(link)
    img_page_soup = BeautifulSoup(img_page_response.content, 'html.parser')

    # Find the full-size image URL. find() returns None when nothing matches,
    # so the container is checked before searching inside it; otherwise a
    # missing div would raise AttributeError and abort the whole run.
    image_div = img_page_soup.find('div', class_='image')
    full_img_tag = image_div.find('img') if image_div is not None else None
    img_src = full_img_tag.get('src') if full_img_tag is not None else None

    if not img_src:
        print(f"Full image not found for thumbnail link: {link}")
        continue

    full_img_url = urljoin(base_url, img_src)

    img_data = requests.get(full_img_url).content
    with open(f'ESA_Full_Images/image_{idx + 1}.jpg', 'wb') as handler:
        handler.write(img_data)

print("Full images download completed!")

Solution

  • Your observations are correct, so follow the links to the detail pages and scrape the information from there.

    The example below builds a list of links to the detail pages (instead of a list of preview images), then scrapes each detail page into a structured dict that you can convert into whatever format you want.

    You can then iterate over `data` and download the image files you want.

    Example
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup


    # Example: collect the download links, title and licence text from the
    # detail page of each gallery entry into a list of dicts.

    base_url = "https://www.esa.int"
    main_url = "https://www.esa.int/ESA_Multimedia/Sets/Earth_from_Space_image_collection/(result_type)/images"
    response = requests.get(main_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get all popup links (each one points to an image detail page).
    # href=True skips anchors without an href instead of raising KeyError.
    popup_links = [
        urljoin(base_url, a['href'])
        for a in soup.find_all('a', class_='popup', href=True)
    ]

    data = []

    for link in popup_links[:5]:
        # remove the slice of [:5] to get all data, this is only for demo purposes to avoid stressing the server
        popup_soup = BeautifulSoup(requests.get(link).content, 'html.parser')

        # One entry per download variant, keyed by the text that follows
        # "Download " in the link title.
        # Some download hrefs are already absolute (the TIF files live on
        # esamultimedia.esa.int); concatenating them onto base_url would give
        # an invalid "https://www.esa.inthttps://..." URL. urljoin keeps
        # absolute hrefs as they are and only resolves relative ones.
        d = {
            item.get('title').rsplit('Download ')[-1]: urljoin(base_url, item.get('href'))
            for item in popup_soup.select('a.dropdown__item[href]')
        }

        d.update({
            'title': popup_soup.h1.get_text(),
            'license': popup_soup.select_one('.modal__meta_licence').get_text(' ', strip=True)
        })

        data.append(d)

    data
    

    [{'LOW-RES JPG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/05/earth_from_space_bolivian_salt_lakes/26073833-1-eng-GB/Earth_from_Space_Bolivian_salt_lakes.jpg',
      'HI-RES JPG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/05/earth_from_space_bolivian_salt_lakes/26073834-1-eng-GB/Earth_from_Space_Bolivian_salt_lakes.jpg',
      'HI-RES TIF': 'https://esamultimedia.esa.int/images/EarthObservation/LakePoopo_S2_20240422_20m_843_3857_enhanced_v2.tif',
      'title': 'Earth from Space: Bolivian salt lakes',
      'license': 'CREDIT contains modified Copernicus Sentinel data (2024), processed by ESA LICENCE CC BY-SA 3.0 IGO or ESA Standard Licence (content can be used under either licence)'},
     {'LOW-RES JPG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/05/earth_from_space_namibian_landforms/26062593-1-eng-GB/Earth_from_Space_Namibian_landforms.jpg',
      'HI-RES JPG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/05/earth_from_space_namibian_landforms/26062594-1-eng-GB/Earth_from_Space_Namibian_landforms.jpg',
      'HI-RES TIF': 'https://esamultimedia.esa.int/images/EarthObservation/Messum_Crater_Namibia_S2_20240418_832_v2.tif',
      'title': 'Earth from Space: Namibian landforms',
      'license': 'CREDIT contains modified Copernicus Sentinel data (2024), processed by ESA LICENCE CC BY-SA 3.0 IGO or ESA Standard Licence (content can be used under either licence)'},
     {'LOW-RES PNG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/04/earth_from_space_seychelles/26051310-1-eng-GB/Earth_from_Space_Seychelles.png',
      'HI-RES PNG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/04/earth_from_space_seychelles/26051311-1-eng-GB/Earth_from_Space_Seychelles.png',
      'HI-RES TIF': 'https://esamultimedia.esa.int/images/EarthObservation/seychelles_S2_20211125_CF_v7.tif',
      'title': 'Earth from Space: Seychelles',
      'license': 'CREDIT contains modified Copernicus Sentinel data (2021), processed by ESA LICENCE CC BY-SA 3.0 IGO or ESA Standard Licence (content can be used under either licence)'},
     {'LOW-RES JPG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/04/earth_from_space_the_mekong_delta/26037485-1-eng-GB/Earth_from_Space_The_Mekong_Delta.jpg',
      'HI-RES JPG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/04/earth_from_space_the_mekong_delta/26037486-1-eng-GB/Earth_from_Space_The_Mekong_Delta.jpg',
      'HI-RES TIF': 'https://esamultimedia.esa.int/images/EarthObservation/Can_Tho_S2_20230326_10m.tif',
      'title': 'Earth from Space: The Mekong Delta',
      'license': 'CREDIT contains modified Copernicus Sentinel data (2023), processed by ESA LICENCE CC BY-SA 3.0 IGO or ESA Standard Licence (content can be used under either licence)'},
     {'LOW-RES JPG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/04/earth_from_space_the_ebro_delta/26025534-2-eng-GB/Earth_from_Space_The_Ebro_Delta.jpg',
      'HI-RES JPG': 'https://www.esa.int/var/esa/storage/images/esa_multimedia/images/2024/04/earth_from_space_the_ebro_delta/26025535-1-eng-GB/Earth_from_Space_The_Ebro_Delta.jpg',
      'HI-RES TIF': 'https://esamultimedia.esa.int/docs/EarthObservation/EbroDelta_S2_20240125_10m_432_3857_enhanced.tif',
      'title': 'Earth from Space: The Ebro Delta',
      'license': 'CREDIT contains modified Copernicus Sentinel data (2024), processed by ESA LICENCE CC BY-SA 3.0 IGO or ESA Standard Licence (content can be used under either licence)'}]