I am learning Python and Beautiful Soup, and as an exercise I am scraping a test webpage. My objective is to extract a URL from the webpage and then follow that URL to extract another URL.
My code is the following:
First step:
# First step: fetch the starting page and pull a link out of its third <a> tag.
path = "http://python-data.dr-chuck.net/known_by_Fikret.html"
# Matches a double quote, one or more characters (captured as group 1), and a
# closing double quote.
pattern = re.compile(r'"(.+)"')
page = urllib2.urlopen(path)
# `bs` is presumably an alias for BeautifulSoup -- the import is not shown here.
soup = bs(page, 'lxml')
# Collect every <a> tag on the page.
a = soup.find_all("a")
# NOTE: group(0) is the entire match, so the surrounding double quotes stay in
# `path` (see the echoed value below); group(1) is the captured text only.
path = re.search(pattern, str(a[2])).group(0)
path
Out:
'"http://python-data.dr-chuck.net/known_by_Montgomery.html"'
Second step:
# Second step: repeat the fetch-and-extract using the URL found in the first step.
# NOTE: `path` still contains the literal double quotes at this point, so the
# URL scheme is read as '"http' and urlopen raises URLError (unknown url type).
page = urllib2.urlopen(path)
soup = bs(page, 'lxml')
a = soup.find_all("a")
# Same extraction as in the first step: whole match of the third <a> tag.
path = re.search(pattern, str(a[2])).group(0)
path
Out:
---------------------------------------------------------------------------
URLError Traceback (most recent call last)
<ipython-input-33-14ad9508aea0> in <module>()
----> 1 page = urllib2.urlopen(path)
2 soup = bs(page, 'lxml')
3 a = soup.find_all("a")
4 path = re.search(pattern, str(a[2])).group(0)
5 path
C:\users\alex\Anaconda2\lib\urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
C:\users\alex\Anaconda2\lib\urllib2.pyc in open(self, fullurl, data, timeout)
427 req = meth(req)
428
--> 429 response = self._open(req, data)
430
431 # post-process response
C:\users\alex\Anaconda2\lib\urllib2.pyc in _open(self, req, data)
450
451 return self._call_chain(self.handle_open, 'unknown',
--> 452 'unknown_open', req)
453
454 def error(self, proto, *args):
C:\users\alex\Anaconda2\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
405 func = getattr(handler, meth_name)
406
--> 407 result = func(*args)
408 if result is not None:
409 return result
C:\users\alex\Anaconda2\lib\urllib2.pyc in unknown_open(self, req)
1264 def unknown_open(self, req):
1265 type = req.get_type()
-> 1266 raise URLError('unknown url type: %s' % type)
1267
1268 def parse_keqv_list(l):
URLError: <urlopen error unknown url type: "http>
Why does urlopen not recognize the URL?
Your advice will be appreciated.
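Use .group(1) when retrieving the result of the regex match. .group(0) returns the whole matched string, including the quotes, whereas .group(1) returns only the text captured by the parentheses, i.e. the URL without the quotes.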