Search code examples
python · web-scraping · beautifulsoup · pandas-datareader

HTTPError: Internal Server Error while scraping a website


I have tried to fix my code in several ways, but I keep getting an HTTPError: Internal Server Error.

Someone told me to use pd.read_html instead of pd.read_excel since the excel one gave me an error about excel file type, but I'm not sure what to do at this point.

Either way I'm getting an error.

# Original (failing) scraper: collect every .xls link from the MOP
# "Pasadas Vehiculares" page and try to load each one with pandas.
import urllib3
import requests
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import pandas as pd
import xlrd

# Intended to collect the parsed DataFrames (never appended to below).
dataframe=[]
# Page listing the downloadable .xls files.
url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
# Site root; the hrefs in the page are relative, so they get prefixed with this.
url1="https://vialidad.mop.gob.cl"
# Browser-like User-Agent so the listing page doesn't reject the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
rawpage = requests.get(url,headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')

# Every anchor whose href contains ".xls" is assumed to be a workbook link.
for link in soup.select('a[href*=".xls"]'):
   
    s=url1+link["href"]
    print(s)
    # Failing line: read_excel fetches the URL itself, without the headers
    # above, and the server answers with an HTTP 500 — hence the HTTPError.
    c = pd.read_excel(s)
    print(c)

Solution

  • I'm unsure why pd.read_excel is raising an HTTPError, but there is a workaround: download the Excel file yourself with requests.get, wrap the response bytes in an in-memory BytesIO object, and pass that to read_excel instead of the URL.

    # Fixed scraper: fetch each .xls with requests (carrying the same
    # browser headers and a timeout), then hand the downloaded bytes to
    # pandas via an in-memory BytesIO object.
    from io import BytesIO
    
    import pandas as pd
    import requests
    import xlrd
    from bs4 import BeautifulSoup
    from openpyxl import load_workbook
    from requests.exceptions import HTTPError
    
    # One parsed DataFrame per successfully downloaded workbook.
    dataframe = []
    url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
    url1 = "https://vialidad.mop.gob.cl"
    # Browser-like User-Agent: the server rejects the default requests UA.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Timeout so a dead server cannot hang the script forever; fail fast
    # if the listing page itself cannot be fetched.
    rawpage = requests.get(url, headers=headers, timeout=30)
    rawpage.raise_for_status()
    soup = BeautifulSoup(rawpage.content, 'html5lib')
    
    # Every anchor whose href contains ".xls" points at a workbook; the
    # hrefs are relative, so prefix them with the site root.
    for link in soup.select('a[href*=".xls"]'):
    
        s = url1 + link["href"]
        print(s)
        try:
            # Send the same browser headers as the page request — the
            # original failure came from read_excel fetching the URL
            # itself with no headers.
            r = requests.get(s, headers=headers, timeout=30)
            r.raise_for_status()
        except HTTPError as e: # Some of the .xls links throw 401 errors
            print(e.response.status_code, "error for", s)
            continue
        # Parse from memory so pandas never touches the network itself.
        c = pd.read_excel(BytesIO(r.content))
        print(c)
        dataframe.append(c)  # keep the result instead of discarding it