Search code examples
python urllib http-status-code-403

Error 403 when trying to download a file from a website using urllib


I'm trying to download these two files from the URL below, but it keeps returning a 403 Forbidden error. The website doesn't mention requiring any authentication to access the files.


# Paths of the files to fetch, relative to data_hostname.
files = ["sm/sm.data.1.AllData", "ce/ce.data.0.AllCESSeries"]

# Absolute path of the folder containing this script.
# NOTE: this name shadows the built-in dir().
dir = os.path.abspath(os.path.dirname(__file__))

# "data" subfolder of the script folder, joined with a literal backslash;
# the process's working directory is switched to it.
datadir = "\\".join((dir, "data"))
os.chdir(datadir)

# Base URL that each entry of `files` is appended to.
data_hostname = "http://download.bls.gov/pub/time.series/"

# Local folder that downloads are written into (same as datadir).
current_filesystem = datadir

def download_data():
    """Fetch every entry of ``files`` and store it under ``current_filesystem``.

    For each entry the remote URL is ``data_hostname`` followed by the entry,
    and the local name is the entry minus its first three characters with
    ``.txt`` appended. The transfer uses ``urllib.request.urlretrieve`` with
    only a URL and a destination path, so no custom request headers are sent.
    Progress is reported with ``print``. Nothing is returned.
    """
    for remote_path in files:
        # Drop the three-character prefix (e.g. "sm/") and add the extension.
        local_name = f"{remote_path[3:]}.txt"
        source_url = f"{data_hostname}{remote_path}"
        destination = f"{current_filesystem}/{local_name}"

        print("downloading from: " + source_url)
        urllib.request.urlretrieve(source_url, destination)
        print("download path: " + destination)

        # Remove temporary files urlretrieve may have left behind.
        urllib.request.urlcleanup()

    print("Finished Downloading Data")

Solution

  • Your code is missing a couple of HTTP headers that the server requires: specifically, User-Agent and Accept-Language.

    Using requests (which you'll need to install if you don't already have it) you could do this:

    import requests
    from pathlib import Path
    
    # Browser-like request headers; the header-less request in the question
    # was refused with 403 Forbidden.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,pt;q=0.7"
    }

    # Number of bytes requested per piece when streaming the body to disk.
    CHUNK = 64 * 1024

    # (connect, read) timeouts in seconds. requests applies no timeout by
    # default, so without this a stalled server would block forever.
    TIMEOUT = (10, 60)

    data_hostname = "http://download.bls.gov/pub/time.series/"

    # Paths of the files to fetch, relative to data_hostname.
    files = [
        'sm/sm.data.1.AllData',
        'ce/ce.data.0.AllCESSeries',
    ]

    # Downloads are written to a "data" folder next to this script.
    target_dir = Path(__file__).parent / "data"
    target_dir.mkdir(exist_ok=True)

    for file in files:
        url = data_hostname + file
        # Local name: last component of the URL path with ".txt" appended.
        target_file = target_dir / (url.rsplit("/", 1)[-1] + ".txt")
        # stream=True avoids loading the whole body into memory at once.
        with requests.get(url, headers=HEADERS, stream=True, timeout=TIMEOUT) as response:
            # Raise requests.HTTPError on a 4xx/5xx status instead of saving
            # the error page as if it were data.
            response.raise_for_status()
            with open(target_file, "wb") as output:
                for chunk in response.iter_content(CHUNK):
                    output.write(chunk)