Search code examples
python beautifulsoup ocr

I would like to download all images from this archive. What should I add to my code?


https://permalink.geldersarchief.nl/8A0A3B746F8147888ADF8FCA559F119B

This archive has 500 images that I want to download and perform OCR on. I have already found this code online that downloads some images, but for some reason it doesn't find the 500 images of the book that I want. What should I add to the code? Thanks in advance.

from bs4 import *
import requests
import os


# CREATE FOLDER
def folder_create(images):
    """Prompt for a folder name, create the folder, then download into it.

    The prompt repeats until a folder has actually been created, and the
    download is started exactly once afterwards.

    Args:
        images: the image tags to hand over to download_images.
    """
    while True:
        folder_name = input("Enter Folder Name:- ")
        try:
            # folder creation
            os.mkdir(folder_name)
        except FileExistsError:
            # if folder exists with that name, ask another name
            print("Folder Exist with that name!")
        except OSError as error:
            # Any other failure (empty name, missing parent, permissions)
            # is reported as what it is instead of "folder exists".
            print(f"Could not create folder {folder_name!r}: {error}")
        else:
            break

    # image downloading start
    download_images(images, folder_name)


# DOWNLOAD ALL IMAGES FROM THAT URL
def download_images(images, folder_name):
    """Download each image in *images* into *folder_name* and print a summary.

    For every tag the source URL is taken from the first non-empty attribute
    among "data-srcset", "data-src", "data-fallback-src" and "src". Tags
    without any of these are skipped. A response whose body decodes as UTF-8
    text is treated as "not an image" and is not saved. Files are written as
    ``images<position>.jpg`` where position is the 1-based index of the tag.

    Args:
        images: sequence of image tags; each must support ``.get(name)``.
        folder_name: existing directory that receives the files.
    """
    total = len(images)

    # print total images found in URL
    print(f"Total {total} Image Found!")

    if not total:
        return

    count = 0
    for i, image in enumerate(images):
        image_link = _find_image_link(image)
        if image_link is None:
            # No usable source attribute on this tag: skip it rather than
            # re-using the link left over from the previous tag.
            continue

        try:
            response = requests.get(image_link, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Skipping {image_link}: {error}")
            continue

        data = response.content
        if not _is_binary(data):
            # Body is plain text (for example an HTML page), not image bytes.
            continue

        try:
            with open(f"{folder_name}/images{i + 1}.jpg", "wb") as f:
                f.write(data)
        except OSError as error:
            print(f"Could not save {image_link}: {error}")
            continue

        # counting number of image downloaded
        count += 1

    if count == total:
        print("All Images Downloaded!")
    else:
        print(f"Total {count} Images Downloaded Out of {total}")


def _find_image_link(image):
    """Return the first non-empty source attribute of *image*, or None."""
    for attribute in ("data-srcset", "data-src", "data-fallback-src", "src"):
        link = image.get(attribute)
        if link:
            return link
    return None


def _is_binary(data):
    """Return True when *data* (bytes) cannot be decoded as UTF-8 text."""
    try:
        data.decode("utf-8")
    except UnicodeDecodeError:
        return True
    return False


# MAIN FUNCTION START
def main(url):
    """Fetch the page at *url*, collect its <img> tags and start the download.

    Args:
        url: address of the page to scrape.

    Raises:
        requests.RequestException: if the page cannot be fetched, the request
            times out, or the server answers with an error status.
    """
    # content of URL; the timeout keeps an unresponsive server from
    # hanging the script forever.
    response = requests.get(url, timeout=30)

    # Stop on 4xx/5xx instead of scraping the error page as if it were
    # the real document.
    response.raise_for_status()

    # Parse HTML Code
    soup = BeautifulSoup(response.text, 'html.parser')

    # find all images in URL
    # NOTE(review): only <img> tags present in the returned HTML are seen;
    # images added later by scripts in a browser would presumably be missed.
    images = soup.find_all('img')

    # Call folder create function
    folder_create(images)


# Ask the user for the address of the page to scrape.
url = input("Enter URL:- ")

# Start the scrape-and-download workflow for that page.
main(url)

I ran the code on the given URL. It said it found 52 images but only downloaded 2 (likely an encoding issue). I was expecting it to download all 500 images that are in there.


Solution

  • Instead of scraping the page, you could use the wget module and loop through the file names, since the image files are publicly accessible.

    import wget
    
    # Every scan shares this prefix and differs only in a zero-padded,
    # four-digit sequence number followed by ".jpg".
    base_url = 'https://preserve2.archieven.nl/mi-37/fonc-gea/2000/2064/NL-AhGldA_2000_2064-'

    # Sequence numbers 0000 through 0500, fetched one after another.
    for index in range(501):
        wget.download(f'{base_url}{index:04}.jpg')
    

    Credit to this Q&A for the formatting range solution.