Search code examples
python, http, beautifulsoup, urllib

python: urllib.request.urlopen not working


I have a problem with urllib.request: the call below raises this error:

HTTPError: HTTP Error 500: Internal Server Error

the code I am using is this:

# Fetch a page over HTTPS and parse the returned HTML with BeautifulSoup.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors: build a default context, then turn off
# hostname checking and certificate verification on it.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url='https://seia.sea.gob.cl/documentos/documento.php?idDocumento=2148717266'
# NOTE(review): only the name `urlopen` is imported above, so the qualified
# name `urllib.request.urlopen` resolves only if `urllib.request` was also
# imported elsewhere (presumably earlier in the same interactive session).
# This is the line the traceback points at for the HTTP 500 error.
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

The whole error log is this:

    HTTPError                                 Traceback (most recent call last)
<ipython-input-9-3be7085b64e6> in <module>
      5 
      6 url='https://seia.sea.gob.cl/documentos/documento.php?idDocumento=2148717266'
----> 7 html = urllib.request.urlopen(url, context=ctx).read()
      8 soup = BeautifulSoup(html, 'html.parser')

~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
    638         # request was successfully received, understood, and accepted.
    639         if not (200 <= code < 300):
--> 640             response = self.parent.error(
    641                 'http', request, response, code, msg, hdrs)
    642 

~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 500: Internal Server Error

I would really appreciate any help! Thanks in advance.


Solution

  • Try setting a User-Agent header in your request (the server appears to reject requests that use urllib's default one):

    from urllib.request import Request, urlopen

    from bs4 import BeautifulSoup

    url = 'https://seia.sea.gob.cl/documentos/documento.php?idDocumento=2148717266'

    # Send a browser-like User-Agent instead of urllib's default one; passing
    # it via `headers=` is equivalent to calling `req.add_header(...)`.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:82.0) Gecko/20100101 Firefox/82.0',
    }
    req = Request(url, headers=headers)

    # Read the body inside a `with` block so the HTTP response (and its
    # underlying connection) is closed even if `read()` raises.
    with urlopen(req) as response:
        content = response.read()

    soup = BeautifulSoup(content, 'html.parser')

    print(soup.prettify())
    

    Prints:

    <html>
     <head>
      <title>
       Documento - 37/8e/3a04fd80aee93aeeb325f0ff8fa06e0a4634
      </title>
    
    ...and so on.