Search code examples
pythonpandascsvpython-requestsurllib

Python – Need help storing <img> src's in CSV, download images from CSV list


I need help.

This code currently gets all src attributes from all <img> on the desired page, stores the URLs in a csv file (it's messy https://i.sstatic.net/KWV26.jpg), and downloads the first image from the first URL.

This is all great, but I want to download all photos, not just the first. (And hopefully clean up that CSV file from the code)

Side note: I understand I don't need to create a CSV to download the images. My goal is to store all img URLs into a CSV, then download the images from the URLs in the CSV

Anything helps!

from bs4 import BeautifulSoup
from time import sleep
import urllib.request
import pandas as pd
import requests
import urllib
import base64
import csv
import time





# Get site
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
page = driver.page_source
soup = BeautifulSoup(page)
# Gets srcs from all <img> from site 
srcs = [img['src'] for img in soup.findAll('img')]




# BELOW code = Writer writes all urls WITH comma after them

print ('Downloading URLs to file')
sleep(1)
with open('output.csv', 'w', newline='\n', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(srcs)



# Below is the code that only downloads the image from the first url. I intend for the code to download all images from all urls

print ('Downloading images to folder')
sleep(1)

filename = "output"

with open("{0}.csv".format(filename), 'r') as csvfile:
    # iterate on all lines
    i = 0
    for line in csvfile:
        splitted_line = line.split(',')
        # check if we have an image URL
        if splitted_line[1] != '' and splitted_line[1] != "\n":
            urllib.request.urlretrieve(splitted_line[1], "img_" + str(i) + ".png")
            print ("Image saved for {0}".format(splitted_line[0]))
            i += 1
        else:
            print ("No result for {0}".format(splitted_line[0]))

Solution

  • Here's another solution that keeps the CSV.

    from bs4 import BeautifulSoup
    from time import sleep
    import urllib.request
    import pandas as pd
    import requests
    import urllib
    import base64
    import csv
    import time
    
    
    # Get site
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        
    #page = driver.page_source
    page = "https://unsplash.com/"
    r = requests.get(page)
    soup = BeautifulSoup(r.text, "html.parser")
    # Gets srcs from all <img> from site 
    srcs = [img['src'] for img in soup.findAll('img')]
    
    # BELOW code = Writer writes all urls WITH comma after them
    
    print ('Downloading URLs to file')
    sleep(1)
    with open('output.csv', 'w', newline='\n', encoding='utf-8') as csvfile:
    #    writer = csv.writer(csvfile)
        for i,s in enumerate(srcs):  # each image number and URL
           csvfile.write(str(i) +','+s+'\n')
    
    # Below is the code that only downloads the image from the first url. I intend for the code to download all images from all urls
    
    print ('Downloading images to folder')
    sleep(1)
    
    filename = "output"
    
    with open("{0}.csv".format(filename), 'r') as csvfile:
        # iterate on all lines
        i = 0
        for line in csvfile:
            splitted_line = line.split(',')
            # check if we have an image URL
            if splitted_line[1] != '' and splitted_line[1] != "\n":
                urllib.request.urlretrieve(splitted_line[1], "img_" + str(i) + ".png")
                print ("Image saved for {0}".format(splitted_line[0]))
                i += 1
            else:
                print ("No result for {0}".format(splitted_line[0]))
    

    Output (output.csv)

    0,https://sb.scorecardresearch.com/p?c1=2&c2=32343279&cv=2.0&cj=1
    1,https://images.unsplash.com/photo-1597523565663-916cf059f524?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format%2Ccompress&fit=crop&w=1000&h=1000
    2,https://images.unsplash.com/profile-1574526450714-e5d331168827image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
    3,https://images.unsplash.com/photo-1599687350404-88b32c067289?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    4,https://images.unsplash.com/profile-1583427783052-3da8ceab5579image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
    5,https://images.unsplash.com/photo-1600181957705-92f267a2740e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    6,https://images.unsplash.com/profile-1545567671893-842f479b15e2?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
    7,https://images.unsplash.com/photo-1600187723541-04457a98cc47?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    8,https://images.unsplash.com/photo-1599687350404-88b32c067289?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    9,https://images.unsplash.com/photo-1600181957705-92f267a2740e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    10,https://images.unsplash.com/photo-1600187723541-04457a98cc47?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80