I need help.
This code currently gets all src attributes from all <img> on the desired page, stores the URLs in a csv file (it's messy https://i.sstatic.net/KWV26.jpg), and downloads the first image from the first URL.
This is all great, but I want to download all photos, not just the first. (And hopefully clean up that CSV file from the code)
Side note: I understand I don't need to create a CSV to download the images. My goal is to store all img URLs into a CSV, then download the images from the URLs in the CSV
Anything helps!
from bs4 import BeautifulSoup
from time import sleep
import urllib.request
import pandas as pd
import requests
import urllib
import base64
import csv
import time
# Get site
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
page = driver.page_source
soup = BeautifulSoup(page)
# Gets srcs from all <img> from site
srcs = [img['src'] for img in soup.findAll('img')]
# BELOW code = Writer writes all urls WITH comma after them
print ('Downloading URLs to file')
sleep(1)
with open('output.csv', 'w', newline='\n', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(srcs)
# Below is the code that only downloads the image from the first url. I intend for the code to download all images from all urls
print ('Downloading images to folder')
sleep(1)
filename = "output"
with open("{0}.csv".format(filename), 'r') as csvfile:
# iterate on all lines
i = 0
for line in csvfile:
splitted_line = line.split(',')
# check if we have an image URL
if splitted_line[1] != '' and splitted_line[1] != "\n":
urllib.request.urlretrieve(splitted_line[1], "img_" + str(i) + ".png")
print ("Image saved for {0}".format(splitted_line[0]))
i += 1
else:
print ("No result for {0}".format(splitted_line[0]))
Here's another solution that keeps the CSV.
from bs4 import BeautifulSoup
from time import sleep
import urllib.request
import pandas as pd
import requests
import urllib
import base64
import csv
import time
# Get site
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
#page = driver.page_source
page = "https://unsplash.com/"
r = requests.get(page)
soup = BeautifulSoup(r.text, "html.parser")
# Gets srcs from all <img> from site
srcs = [img['src'] for img in soup.findAll('img')]
# BELOW code = Writer writes all urls WITH comma after them
print ('Downloading URLs to file')
sleep(1)
with open('output.csv', 'w', newline='\n', encoding='utf-8') as csvfile:
# writer = csv.writer(csvfile)
for i,s in enumerate(srcs): # each image number and URL
csvfile.write(str(i) +','+s+'\n')
# Below is the code that only downloads the image from the first url. I intend for the code to download all images from all urls
print ('Downloading images to folder')
sleep(1)
filename = "output"
with open("{0}.csv".format(filename), 'r') as csvfile:
# iterate on all lines
i = 0
for line in csvfile:
splitted_line = line.split(',')
# check if we have an image URL
if splitted_line[1] != '' and splitted_line[1] != "\n":
urllib.request.urlretrieve(splitted_line[1], "img_" + str(i) + ".png")
print ("Image saved for {0}".format(splitted_line[0]))
i += 1
else:
print ("No result for {0}".format(splitted_line[0]))
Output (output.csv)
0,https://sb.scorecardresearch.com/p?c1=2&c2=32343279&cv=2.0&cj=1
1,https://images.unsplash.com/photo-1597523565663-916cf059f524?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format%2Ccompress&fit=crop&w=1000&h=1000
2,https://images.unsplash.com/profile-1574526450714-e5d331168827image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
3,https://images.unsplash.com/photo-1599687350404-88b32c067289?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
4,https://images.unsplash.com/profile-1583427783052-3da8ceab5579image?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
5,https://images.unsplash.com/photo-1600181957705-92f267a2740e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
6,https://images.unsplash.com/profile-1545567671893-842f479b15e2?auto=format&fit=crop&w=32&h=32&q=60&crop=faces&bg=fff
7,https://images.unsplash.com/photo-1600187723541-04457a98cc47?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
8,https://images.unsplash.com/photo-1599687350404-88b32c067289?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
9,https://images.unsplash.com/photo-1600181957705-92f267a2740e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80
10,https://images.unsplash.com/photo-1600187723541-04457a98cc47?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&w=1000&q=80