I have the code below, which works very well and prints the page content. But when I change r to a Google News page (the commented-out line), I get an IOError (full traceback below). Why? How can I use BeautifulSoup with Google News pages?
code that runs fine:
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
#r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)
print soup.prettify()
code that creates errors:
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)
print soup.prettify()
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
c:\users\abc\appdata\local\temp\tmpvxie2e.py in <module>()
2 import urllib
3 r = urllib.urlopen('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts').read()
----> 4 r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
5 soup = BeautifulSoup(r)
6 print type(soup)
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in urlopen(url, data, proxies, context)
85 opener = _urlopener
86 if data is None:
---> 87 return opener.open(url)
88 else:
89 return opener.open(url, data)
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open(self, fullurl, data)
211 try:
212 if data is None:
--> 213 return getattr(self, name)(url)
214 else:
215 return getattr(self, name)(url, data)
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\urllib.pyc in open_https(self, url, data)
441 if realhost: h.putheader('Host', realhost)
442 for args in self.addheaders: h.putheader(*args)
--> 443 h.endheaders(data)
444 errcode, errmsg, headers = h.getreply()
445 fp = h.getfile()
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in endheaders(self, message_body)
1047 else:
1048 raise CannotSendHeader()
-> 1049 self._send_output(message_body)
1050
1051 def request(self, method, url, body=None, headers={}):
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in _send_output(self, message_body)
891 msg += message_body
892 message_body = None
--> 893 self.send(msg)
894 if message_body is not None:
895 #message_body was not a string (i.e. it is a file) and
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in send(self, data)
853 if self.sock is None:
854 if self.auto_open:
--> 855 self.connect()
856 else:
857 raise NotConnected()
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\httplib.pyc in connect(self)
1272
1273 self.sock = self._context.wrap_socket(self.sock,
-> 1274 server_hostname=server_hostname)
1275
1276 __all__.append("HTTPSConnection")
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
350 suppress_ragged_eofs=suppress_ragged_eofs,
351 server_hostname=server_hostname,
--> 352 _context=self)
353
354 def set_npn_protocols(self, npn_protocols):
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
577 # non-blocking
578 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 579 self.do_handshake()
580
581 except (OSError, ValueError):
C:\Users\abc\AppData\Local\Enthought\Canopy\App\appdata\canopy-1.6.2.3262.win-x86_64\lib\ssl.pyc in do_handshake(self, block)
806 if timeout == 0.0 and block:
807 self.settimeout(None)
--> 808 self._sslobj.do_handshake()
809 finally:
810 self.settimeout(timeout)
IOError: [Errno socket error] EOF occurred in violation of protocol (_ssl.c:590)
As suggested in the comments, I tried the code below, but I'm still facing the same issue :(
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)

import requests
s = requests.Session()
s.mount('https://', MyAdapter())
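If I understand the suggestion correctly, the request then has to go through the mounted session rather than plain urllib (a sketch of my reading of it, using the s defined above):

r = s.get('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d')
soup = BeautifulSoup(r.text)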
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager

class SSLAdapter(HTTPAdapter):
    '''An HTTPS Transport Adapter that uses an arbitrary SSL version.'''
    def __init__(self, ssl_version=None, **kwargs):
        self.ssl_version = ssl_version
        super(SSLAdapter, self).__init__(**kwargs)

    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=self.ssl_version)
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('https://www.google.com/#q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d').read()
soup = BeautifulSoup(r)
print type(soup)
print soup.prettify()
So, the issue arises from using the Enthought Canopy version of Python. In most versions of Python, urllib doesn't check or verify the SSL certificate; in the Canopy version it seems that it does. I was unable to find documentation on how this is implemented.
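If you want to stay with urllib, one workaround may be possible: your traceback shows that Canopy's urllib.urlopen accepts a context argument, so passing an unverified SSL context could skip the certificate check. This is only a sketch, assuming Python 2.7.9+ (where ssl._create_unverified_context exists), and it disables verification entirely, so use it for testing only:

import ssl
import urllib

# Unverified context: certificate checking is turned off.
context = ssl._create_unverified_context()
r = urllib.urlopen('https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d',
                   context=context).read()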
Also, you'll see in the code below that I added the html.parser argument to the BeautifulSoup call. It would have worked the way you had it, but BeautifulSoup 4 has changed its setup, and it's now best practice to name the parser you want explicitly.
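For reference, these are the usual parser choices in BeautifulSoup 4 (html.parser ships with Python; the other two need an extra install):

from bs4 import BeautifulSoup

html = '<p>example</p>'
soup = BeautifulSoup(html, 'html.parser')  # stdlib parser, no extra install
#soup = BeautifulSoup(html, 'lxml')        # faster; requires the lxml package
#soup = BeautifulSoup(html, 'html5lib')    # most lenient; requires html5lib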
Below is a working version of your code that fetches the Google News page you wanted over HTTPS:
from bs4 import BeautifulSoup
import requests

# Send a browser-like user agent so Google serves the regular results page.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}

#r = requests.get('http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts', headers=headers)
# Note the /search?q= endpoint; everything after '#' is a fragment the server never sees.
r = requests.get('https://www.google.com/search?q=%22eric+bledsoe%22&tbm=nws&tbs=qdr:d', headers=headers)

soup = BeautifulSoup(r.text, "html.parser")
print type(soup)
print soup.prettify()
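From there you can work with the soup as usual. For example, to list the links on the page (just a sketch; I'm iterating over all anchors because Google's result markup changes often, so any specific class name would be an assumption):

for a in soup.find_all('a', href=True):
    print a['href']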