I have tried to fix my code in a lot of ways, but I keep getting HTTPError: Internal Server Error. Someone told me to use pd.read_html instead of pd.read_excel, since read_excel gave me an error about the Excel file type, but I'm not sure what to do at this point. Either way I'm getting an error.
import urllib3
import requests
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import pandas as pd
import xlrd

dataframe = []
url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
url1 = "https://vialidad.mop.gob.cl"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
rawpage = requests.get(url, headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
for link in soup.select('a[href*=".xls"]'):
    s = url1 + link["href"]
    print(s)
    c = pd.read_excel(s)
    print(c)
I'm unsure why pd.read_excel is raising an HTTPError, but as a workaround you can first fetch the Excel file with requests.get and then pass it to read_excel as an in-memory file object via BytesIO.
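One plausible cause (an assumption on my part, not verified against this server) is that pandas downloads the URL itself through urllib with a default User-Agent that some servers reject, even when the same link works from a browser. On pandas 1.2+ you can test that theory directly, because read_excel forwards storage_options key-value pairs as HTTP headers for http(s) URLs:

# Sketch, untested against this site: s is one of the .xls links from the loop.
c = pd.read_excel(s, storage_options={"User-Agent": headers["User-Agent"]})

If that still fails, the requests-based version below gives you explicit control over the download and its errors: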
from io import BytesIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError

# Note: xlrd (for .xls) and/or openpyxl (for .xlsx) must be installed
# for pandas to parse the files, but they don't need to be imported here.
dataframe = []
url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
url1 = "https://vialidad.mop.gob.cl"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

rawpage = requests.get(url, headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')

for link in soup.select('a[href*=".xls"]'):
    s = url1 + link["href"]
    print(s)
    try:
        # Reuse the browser User-Agent for the file download as well.
        r = requests.get(s, headers=headers)
        r.raise_for_status()
    except HTTPError as e:  # Some of the .xls links throw 401 errors
        print(e.response.status_code, "error for", s)
        continue
    # Wrap the downloaded bytes so read_excel sees a local file-like object.
    c = pd.read_excel(BytesIO(r.content))
    print(c)
    dataframe.append(c)
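Two follow-ups. The file-type error you originally saw usually points to an engine mismatch: xlrd 2.0+ only reads legacy .xls files, while .xlsx needs openpyxl; neither has to be imported, but the right one must be installed, and when read_excel receives a BytesIO buffer, pandas 1.2+ sniffs the leading bytes to pick the engine for you. Also, since the code starts with dataframe = [], I assume you want one combined table at the end; a minimal sketch, assuming all the downloaded sheets share the same columns:

# After the loop: stack the collected per-file tables into one DataFrame.
all_data = pd.concat(dataframe, ignore_index=True)
print(all_data.shape)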