I have to write a Python script that performs the following tasks:
1- Download the Movielens datasets from the url 'http://files.grouplens.org/datasets/movielens/ml-25m.zip'
2- Download the Movielens checksum from the url 'http://files.grouplens.org/datasets/movielens/ml-25m.zip.md5'
3- Check whether the checksum of the archive corresponds to the downloaded one
4- If the checksums match, print the names of the files contained in the downloaded archive
This is what I wrote up to now:
from zipfile import ZipFile
from urllib import request
import hashlib
def md5(fname, chunk_size=4096):
    """Return the hexadecimal MD5 digest of the file at *fname*.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so the
    whole file never has to be held in memory.

    Args:
        fname: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 4096).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # iter() with a sentinel keeps calling f.read() until it returns
        # b"", i.e. until end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
# Remote locations of the MovieLens 25M archive and its published MD5
# checksum, and the local file names they are saved under.
url_datasets = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip'
datasets = 'datasets.zip'
url_checksum = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip.md5'
# The destination name has to be bound before urlretrieve() uses it;
# otherwise the second download raises NameError.
checksum = 'datasets.zip.md5'

request.urlretrieve(url_datasets, datasets)
request.urlretrieve(url_checksum, checksum)

# NOTE: the downloaded checksum is not compared with the archive's digest
# yet, so the archive contents are listed unconditionally.
with ZipFile(datasets, 'r') as zipObj:
    listOfiles = zipObj.namelist()
    for elem in listOfiles:
        print(elem)
So what I'm missing is a way to compare the checksum I computed with the one I downloaded. Maybe I could also create a function "printFiles" that checks the checksum and, if it matches, prints the list of files.
Is there something else I can improve?
Your code isn't actually making any of the requests.
from zipfile import ZipFile
import hashlib
import requests
def md5(fname, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at *fname*.

    The file is hashed in ``chunk_size``-byte pieces inside a ``with`` block,
    so a large archive is never loaded into memory in one go and the file
    handle is always closed.

    Args:
        fname: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 1 MiB).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hash_md5 = hashlib.md5()
    with open(fname, 'rb') as f:
        # iter() with a sentinel stops once read() returns b"" (end of file).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
# Remote locations of the MovieLens 25M archive and its published MD5
# checksum, plus the local file names associated with them.
url_datasets = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip'
datasets = 'datasets.zip'
url_checksum = 'http://files.grouplens.org/datasets/movielens/ml-25m.zip.md5'
# Kept from the original script; the checksum is compared in memory and is
# not written to this path.
checksum = 'datasets.zip.md5'

# Stream the archive to disk in chunks instead of buffering the whole
# download in memory, and fail early on an HTTP error so an error page is
# never saved and hashed as if it were the archive.
with requests.get(url_datasets, allow_redirects=True, stream=True,
                  timeout=60) as ds:
    ds.raise_for_status()
    with open(datasets, 'wb') as archive:
        for chunk in ds.iter_content(chunk_size=1 << 20):
            archive.write(chunk)

cs = requests.get(url_checksum, allow_redirects=True, timeout=60)
cs.raise_for_status()

ds_md5 = md5(datasets)
# The first whitespace-separated token of the checksum file is taken as the
# hex digest.
cs_md5 = cs.content.decode('utf-8').split()[0]

print(ds_md5)
print(cs_md5)

if ds_md5 == cs_md5:
    print("MATCH")
    # Only list the archive contents once its integrity is confirmed.
    with ZipFile(datasets, 'r') as zipObj:
        listOfiles = zipObj.namelist()
        for elem in listOfiles:
            print(elem)
else:
    print("Checksum fail")