Search code examples
python · web-scraping · beautifulsoup · pandas-datareader

HTTPError: Internal Server Error while scraping a website


I have tried to fix my code in several ways, but I keep getting an HTTPError: Internal Server Error.

Someone told me to use pd.read_html instead of pd.read_excel since the excel one gave me an error about excel file type, but I'm not sure what to do at this point.

Either way I'm getting an error.

# Original (failing) scraper: collect every .xls link from the MOP
# "Pasadas Vehiculares" page and try to load each one with pandas.
import urllib3
import requests
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import pandas as pd
import xlrd

# Intended to collect the parsed DataFrames (never appended to below).
dataframe=[]
# Page listing the downloadable .xls files.
url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
# Site root; the hrefs in the page are relative, so they get prefixed with this.
url1="https://vialidad.mop.gob.cl"
# Browser-like User-Agent so the listing page doesn't reject the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
rawpage = requests.get(url,headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')

# Every anchor whose href contains ".xls" is assumed to be a workbook link.
for link in soup.select('a[href*=".xls"]'):
   
    s=url1+link["href"]
    print(s)
    # Failing line: read_excel fetches the URL itself, without the headers
    # above, and the server answers with an HTTP 500 — hence the HTTPError.
    c = pd.read_excel(s)
    print(c)

Solution

  • I'm unsure why pd.read_excel is raising an HTTPError, but there is a workaround: download the Excel file yourself with requests.get, wrap the response bytes in an in-memory BytesIO object, and pass that to read_excel instead of the URL.

    # Fixed scraper: fetch each .xls with requests (carrying the same
    # browser headers and a timeout), then hand the downloaded bytes to
    # pandas via an in-memory BytesIO object.
    from io import BytesIO
    
    import pandas as pd
    import requests
    import xlrd
    from bs4 import BeautifulSoup
    from openpyxl import load_workbook
    from requests.exceptions import HTTPError
    
    # One parsed DataFrame per successfully downloaded workbook.
    dataframe = []
    url = "https://vialidad.mop.gob.cl/Paginas/PasadasVehiculares.aspx"
    url1 = "https://vialidad.mop.gob.cl"
    # Browser-like User-Agent: the server rejects the default requests UA.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Timeout so a dead server cannot hang the script forever; fail fast
    # if the listing page itself cannot be fetched.
    rawpage = requests.get(url, headers=headers, timeout=30)
    rawpage.raise_for_status()
    soup = BeautifulSoup(rawpage.content, 'html5lib')
    
    # Every anchor whose href contains ".xls" points at a workbook; the
    # hrefs are relative, so prefix them with the site root.
    for link in soup.select('a[href*=".xls"]'):
    
        s = url1 + link["href"]
        print(s)
        try:
            # Send the same browser headers as the page request — the
            # original failure came from read_excel fetching the URL
            # itself with no headers.
            r = requests.get(s, headers=headers, timeout=30)
            r.raise_for_status()
        except HTTPError as e: # Some of the .xls links throw 401 errors
            print(e.response.status_code, "error for", s)
            continue
        # Parse from memory so pandas never touches the network itself.
        c = pd.read_excel(BytesIO(r.content))
        print(c)
        dataframe.append(c)  # keep the result instead of discarding it