I have tried several approaches (this is the latest one) to download any file that has a CSV extension from a provided URL. The file does download, but it is corrupted. Here is the code:
import requests
import os
import tarfile
import h5py
from tqdm import tqdm
import argparse
import pandas as pd
def download_main(
    url,
    save_path="/home/mohammed/spatialprot/store",
    file_format=None
):
    """Download *url* to disk and try to load the result with pandas.

    The response body is streamed to *save_path* in 8 KiB chunks and the
    saved file is then passed to ``pd.read_csv``.

    Args:
        url: Address of the file to fetch.
        save_path: Destination file. If it names an existing directory, the
            file is written inside it, named after the last component of the
            URL path (``"download"`` when the URL path has no file name).
        file_format: Currently unused; kept so existing callers keep working.

    Returns:
        The parsed ``DataFrame`` on success, or ``None`` if the download,
        the write, or the CSV parse failed (the error is printed).
    """
    from urllib.parse import urlparse

    try:
        # open() cannot write to a directory, so derive a file name from the
        # URL when the caller only supplied a destination folder.
        if os.path.isdir(save_path):
            filename = os.path.basename(urlparse(url).path) or "download"
            save_path = os.path.join(save_path, filename)

        # The context manager releases the streamed connection even if
        # writing fails part-way; the timeout stops a stalled server from
        # blocking forever.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Check for any errors in the request
            # Open a local file for writing in binary mode.
            with open(save_path, "wb") as file:
                for chunk in tqdm(response.iter_content(chunk_size=8192)):
                    if chunk:  # Filter out keep-alive new chunks.
                        file.write(chunk)
        print(f"Downloaded {url}, saved to {save_path}")
        # NOTE(review): if the URL actually serves an archive (e.g. a ZIP)
        # rather than plain CSV, parsing can fail here unless the saved file
        # name carries the matching extension for pandas to infer from.
        return pd.read_csv(save_path)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return None
If you examine the HTML document, you'll see that the downloadable files are packaged in a ZIP archive, which is why the saved "CSV" looks corrupted. In this case the archive contains only one file. You can download and extract it like this:
import requests
import zipfile
import tempfile
URL = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/mpjzbtfgfr-1.zip"
CHUNK = 32 * 1024  # chunk size, in bytes, passed to iter_content()

# Stream the archive into a temporary file so the whole download never has
# to sit in memory, then let zipfile read it back from that same file object.
with requests.get(URL, stream=True) as response:
    response.raise_for_status()
    with tempfile.TemporaryFile() as outdata:
        for chunk in response.iter_content(chunk_size=CHUNK):
            outdata.write(chunk)
        outdata.flush()
        # Named "archive" rather than "zip" so the builtin zip() is not
        # shadowed at module level.
        with zipfile.ZipFile(outdata) as archive:
            print("Extracting", archive.namelist())
            # With no path argument the members go to the current
            # working directory.
            archive.extractall()
The extracted file (CRC_clusters_neighborhoods_markers.csv) will be in your current working directory.