Tags: python, web-scraping, beautifulsoup, enthought

BeautifulSoup error while trying to get information from Google News page: Enthought Canopy


I have the code below from the page. It works very well and prints the page content. But when I change r to a Google News page (commented out below), I get an error (IOError Traceback (most recent call last)). Why? How can I use BeautifulSoup with Google News pages?

Code that runs fine:

from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
#r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)

print soup.prettify()

Code that raises the error:

from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)

print soup.prettify()


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
c:\users\abc\appdata\local\temp\tmpvxie2e.py in <module>()
      2 import urllib
      3 r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
----> 4 r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
      5 soup = BeautifulSoup(r)
      6 print type(soup)

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in urlopen(url, data, proxies, context)
     85         opener = _urlopener
     86     if data is None:
---> 87         return opener.open(url)
     88     else:
     89         return opener.open(url, data)

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open(self, fullurl, data)
    211         try:
    212             if data is None:
--> 213                 return getattr(self, name)(url)
    214             else:
    215                 return getattr(self, name)(url, data)

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open_https(self, url, data)
    441             if realhost: h.putheader('Host', realhost)
    442             for args in self.addheaders: h.putheader(*args)
--> 443             h.endheaders(data)
    444             errcode, errmsg, headers = h.getreply()
    445             fp = h.getfile()

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in endheaders(self, message_body)
   1047         else:
   1048             raise CannotSendHeader()
-> 1049         self._send_output(message_body)
   1050 
   1051     def request(self, method, url, body=None, headers={}):

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in _send_output(self, message_body)
    891             msg += message_body
    892             message_body = None
--> 893         self.send(msg)
    894         if message_body is not None:
    895             #message_body was not a string (i.e. it is a file) and

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in send(self, data)
    853         if self.sock is None:
    854             if self.auto_open:
--> 855                 self.connect()
    856             else:
    857                 raise NotConnected()

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in connect(self)
   1272 
   1273             self.sock = self._context.wrap_socket(self.sock,
-> 1274                                                   server_hostname=server_hostname)
   1275 
   1276     __all__.append("HTTPSConnection")

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    350                          suppress_ragged_eofs=suppress_ragged_eofs,
    351                          server_hostname=server_hostname,
--> 352                          _context=self)
    353 
    354     def set_npn_protocols(self, npn_protocols):

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
    577                         # non-blocking
    578                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 579                     self.do_handshake()
    580 
    581             except (OSError, ValueError):

C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in do_handshake(self, block)
    806             if timeout == 0.0 and block:
    807                 self.settimeout(None)
--> 808             self._sslobj.do_handshake()
    809         finally:
    810             self.settimeout(timeout)

IOError: [Errno socket error] EOF occurred in violation of protocol (_ssl.c:590) 

Update 1

As suggested in the comments, I tried the code below, but I'm still facing the same issue :(

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)


import requests
s = requests.Session()
s.mount('https://', MyAdapter())

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager

class SSLAdapter(HTTPAdapter):
    '''An HTTPS Transport Adapter that uses an arbitrary SSL version.'''
    def __init__(self, ssl_version=None, **kwargs):
        self.ssl_version = ssl_version

        super(SSLAdapter, self).__init__(**kwargs)

    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=self.ssl_version)

from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)

print soup.prettify()
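
For what it's worth, the last block above still fetches the page with urllib.urlopen, so the session s with the mounted adapter never actually handles the request. Below is a minimal, self-contained sketch of how the adapter approach is meant to be exercised; it assumes that forcing TLSv1 is what the server needs, and whether Google returns useful results for this URL is a separate question (see the solution below):

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import requests
import ssl

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        # Build the HTTPS connection pool with TLSv1 forced
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)

s = requests.Session()
s.mount('https://', MyAdapter())

# The request has to go through the session, otherwise the adapter is never used
r = s.get('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d')
print r.status_code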

Solution

  • So, the issue arises because you are using the Enthought Canopy version of Python. In most Python builds, urllib doesn't check or verify the SSL certificate, but the Canopy build appears to verify it. I was unable to find documentation on how this is implemented. (If you want to keep using urllib, a possible workaround is sketched after the code below.)

    Also, you'll see in the code below that I added the html.parser argument to the BeautifulSoup call. It would have worked the way you had it, but the API in BeautifulSoup 4 has changed and it's best practice to specify which parser you would like to use.

    Below is a working version of your code that fetches the Google News page you wanted over HTTPS. Note that it requests /search?q=... rather than your original /#q=... form; everything after the # is a fragment that is never sent to the server:

    from bs4 import BeautifulSoup
    import requests
    
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
    
    #r = requests.get('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts', headers=headers)
    r = requests.get('https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d', headers=headers)
    
    soup = BeautifulSoup(r.text, "html.parser")
    print type(soup)
    
    print soup.prettify()
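
    If you'd rather stay with urllib, the traceback above shows this Canopy urllib.urlopen signature as urlopen(url, data, proxies, context), so you may be able to get past the certificate check by passing an unverified SSL context. This is only a sketch under that assumption; it also relies on ssl._create_unverified_context() being available in this Python 2.7 build, and it disables certificate verification entirely, so use it with care:

    import ssl
    import urllib
    from bs4 import BeautifulSoup

    # Unverified context: certificate verification is skipped (use with care)
    context = ssl._create_unverified_context()

    r = urllib.urlopen('https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d',
                       context=context).read()
    soup = BeautifulSoup(r, "html.parser")
    print type(soup)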