Search code examples
python, screen-scraping

Download image file from the HTML page source


I am writing a scraper that downloads all the image files from an HTML page and saves them to a specific folder. All the images are part of the HTML page.


Solution

  • Here is some code to download all the images from the supplied URL, and save them in the specified output folder. You can modify it to your own needs.

    """
    dumpimages.py
        Downloads all the images on the supplied URL, and saves them to the
        specified output folder ("/test/" by default)
    
    Usage:
        python dumpimages.py http://example.com/ [output]
    """
    from bs4 import BeautifulSoup as bs
    from urllib.request import (
        urlopen, urlparse, urlunparse, urlretrieve)
    import os
    import sys
    
    def main(url, out_folder="/test/"):
        """Download every image referenced by the page at 'url' into 'out_folder'.

        Args:
            url: Address of the HTML page to scan for <img> tags.
            out_folder: Directory that receives the downloaded files. It is
                created if it does not exist. Defaults to "/test/".

        Each image "src" is resolved against the page's final URL (after any
        redirects), so absolute, root-relative, document-relative and
        protocol-relative ("//host/pic.png") sources are all fetched from the
        right place. <img> tags that have no "src", or whose URL path carries
        no file name, are skipped. Images that share a file name overwrite
        one another in 'out_folder'.
        """
        # Imported here so the function is self-contained and does not rely
        # on the module-level import list being extended.
        from urllib.parse import urljoin, urlsplit

        # Close the connection deterministically, and name the parser
        # explicitly so the result does not depend on which optional parsers
        # happen to be installed.
        with urlopen(url) as response:
            base_url = response.geturl()
            soup = bs(response, "html.parser")

        # urlretrieve does not create missing directories.
        os.makedirs(out_folder, exist_ok=True)

        for image in soup.find_all("img"):
            src = (image.get("src") or "").strip()
            if not src:
                continue
            print("Image: %s" % src)

            image_url = urljoin(base_url, src)
            # Derive the name from the URL path only, so a query string such
            # as "?v=3" does not end up in the file name.
            filename = os.path.basename(urlsplit(image_url).path)
            if not filename:
                continue
            urlretrieve(image_url, os.path.join(out_folder, filename))
    
    def _usage():
        """Print the one-line command-line synopsis to standard output."""
        synopsis = "usage: python dumpimages.py http://example.com [outpath]"
        print(synopsis)
    
    if __name__ == "__main__":
        # Accepted forms (only the trailing one or two arguments are examined):
        #     dumpimages.py URL
        #     dumpimages.py URL OUT_FOLDER
        args = sys.argv[1:]
        url = args[-1] if args else ""
        out_folder = "/test/"
        if not url.lower().startswith("http"):
            # The last argument is not a URL, so it can only be the output
            # folder, and the argument before it must be the URL. Checking the
            # length first means running with no arguments prints the usage
            # text instead of failing with an IndexError.
            if len(args) < 2 or not args[-2].lower().startswith("http"):
                _usage()
                sys.exit(-1)
            out_folder = args[-1]
            url = args[-2]
        main(url, out_folder)
    

    Edit: You can specify the output folder now.