Search code examples
python · pandas · csv · python-requests · urllib

Python – Need help storing <img> src's in CSV, download images from CSV list


I need help.

This code currently gets the src attribute from every <img> tag on the desired page, stores the URLs in a CSV file (it's messy: https://i.sstatic.net/KWV26.jpg), and downloads only the first image from the first URL.

This is all great, but I want to download all photos, not just the first. (And hopefully clean up that CSV file from the code)

Side note: I understand I don't need to create a CSV to download the images. My goal is to store all img URLs into a CSV, then download the images from the URLs in the CSV

Anything helps!

from bs4 import BeautifulSoup
from time import sleep
import urllib.request
import pandas as pd
import requests
import urllib
import base64
import csv
import time





# Get site
# NOTE(review): these are CORS *response* headers; only User-Agent is a real
# request header, and `headers` is never actually sent anywhere below — confirm
# whether it was meant to be passed to a requests.get() call.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
# `driver` is a Selenium WebDriver created elsewhere (not shown in this file).
page = driver.page_source
# Explicit parser so results are consistent regardless of what lxml/html5lib
# happens to be installed.
soup = BeautifulSoup(page, 'html.parser')
# Gets srcs from all <img> from site
srcs = [img['src'] for img in soup.findAll('img')]


# Write ONE (index, url) row per image.  The original wrote the whole list as
# a single CSV row, which produced the "messy" file and meant the download
# loop below only ever saw one line (so only the first image downloaded).

print ('Downloading URLs to file')
sleep(1)
with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    for idx, src in enumerate(srcs):
        writer.writerow([idx, src])


# Read the CSV back and download EVERY image, not just the first one.

print ('Downloading images to folder')
sleep(1)

filename = "output"

with open("{0}.csv".format(filename), 'r', newline='', encoding='utf-8') as csvfile:
    # csv.reader handles quoting, so URLs containing commas stay intact,
    # and rows arrive without trailing newlines (unlike manual split).
    for row in csv.reader(csvfile):
        # Skip rows that have no usable URL.
        if len(row) < 2 or not row[1].strip():
            print ("No result for {0}".format(row[0] if row else '?'))
            continue
        urllib.request.urlretrieve(row[1].strip(), "img_" + row[0] + ".png")
        print ("Image saved for {0}".format(row[0]))

Solution

  • Here's another solution that keeps the CSV.

    from bs4 import BeautifulSoup
    from time import sleep
    import urllib.request
    import pandas as pd
    import requests
    import urllib
    import base64
    import csv
    import time
    
    
    # Get site
    # NOTE(review): these are CORS *response* headers; only User-Agent makes
    # sense on a request, and `headers` is never passed to requests.get()
    # below — kept for parity with the question's code.
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }

    # Plain requests instead of the Selenium driver used in the question.
    #page = driver.page_source
    page = "https://unsplash.com/"
    r = requests.get(page)
    soup = BeautifulSoup(r.text, "html.parser")
    # Gets srcs from all <img> from site 
    srcs = [img['src'] for img in soup.findAll('img')]

    # Write one "index,url" row per image.  Using csv.writer (rather than
    # hand-concatenating str(i)+','+s+'\n') quotes any URL that happens to
    # contain a literal comma, keeping the file parseable.

    print ('Downloading URLs to file')
    sleep(1)
    with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        for i, s in enumerate(srcs):  # each image number and URL
            writer.writerow([i, s])

    # Download every image listed in the CSV.

    print ('Downloading images to folder')
    sleep(1)

    filename = "output"

    with open("{0}.csv".format(filename), 'r', newline='', encoding='utf-8') as csvfile:
        # csv.reader strips the row terminator, so the URL no longer carries
        # the trailing '\n' that the manual line.split(',') version fed into
        # urlretrieve.
        for row in csv.reader(csvfile):
            # check if we have an image URL
            if len(row) < 2 or not row[1].strip():
                print ("No result for {0}".format(row[0] if row else '?'))
                continue
            urllib.request.urlretrieve(row[1].strip(), "img_" + row[0] + ".png")
            print ("Image saved for {0}".format(row[0]))
    

    Output (output.csv)

    0,https://sb.scorecardresearch.com/p?c1=2&c2=32343279&cv=2.0&cj=1
    1,https://images.unsplash.com/photo-1597523565663-916cf059f524?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format%2Ccompress&fit=crop&w=1000&h=1000
    2,https://images.unsplash.com/profile-1574526450714-e5d331168827image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
    3,https://images.unsplash.com/photo-1599687350404-88b32c067289?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    4,https://images.unsplash.com/profile-1583427783052-3da8ceab5579image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
    5,https://images.unsplash.com/photo-1600181957705-92f267a2740e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    6,https://images.unsplash.com/profile-1545567671893-842f479b15e2?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
    7,https://images.unsplash.com/photo-1600187723541-04457a98cc47?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    8,https://images.unsplash.com/photo-1599687350404-88b32c067289?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    9,https://images.unsplash.com/photo-1600181957705-92f267a2740e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
    10,https://images.unsplash.com/photo-1600187723541-04457a98cc47?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80