I am trying to scrape data from ESPN Cricinfo using a Python script available on GitHub. The code is the following.
# Scrape first-class match pages from ESPN Cricinfo search results and save
# each match page into the espncricinfo-fc/ directory.
import urllib.request as ur
import csv
import sys
import time
import os
import unicodedata
from urllib.parse import urlparse
from bs4 import BeautifulSoup
BASE_URL = 'http://www.espncricinfo.com'
# Walk every results page of the "first class" search query.
for i in range(0, 6019):
url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=first%20class;all=1;page='
soupy = BeautifulSoup(ur.urlopen(url + str(i)).read())
# Throttle requests so the server is not hammered.
time.sleep(1)
# Each matching anchor links to one match page.
for new_host in soupy.findAll('a', {'class' : 'srchPlyrNmTxt'}):
try:
new_host = new_host['href']
except:
continue
odiurl = BASE_URL + urlparse(new_host).geturl()
# NOTE(review): .encode() turns new_host into a bytes object here.
new_host = unicodedata.normalize('NFKD', new_host).encode('ascii','ignore')
print (new_host)
# BUG: str.split requires a str but new_host is bytes -- this raises the
# TypeError quoted below. Also the [4] subscript is applied to print()'s
# return value (None), not to the split list.
print (str.split(new_host, "/"))[4]
# BUG: urllib2 is never imported (NameError in Python 3); ur.urlopen was
# presumably intended.
html = urllib2.urlopen(odiurl).read()
if html:
with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
f.write(html)
And the error is in this line.
print (str.split(new_host, "/"))[4]
TypeError: descriptor 'split' requires a 'str' object but received a 'bytes'. Any help from you would be appreciated. Thanks!
Use
str.split(new_host.decode("utf-8"), "/")[4]
.decode("utf-8")
obviously being the most important part. That turns your byte
object to a string.
On another note, be aware that urllib2 (which you're using but not importing, by the way) no longer exists in Python 3 — it was merged into urllib.request. Instead, you could use from urllib.request import urlopen.
EDIT: This is the full code that won't give you the error you described in your question. I am highlighting that because without the file previously created, the with open(...)
statement will give you a FileNotFoundError
.
"""Scrape first-class match pages from ESPN Cricinfo search results and save
each match page under the espncricinfo-fc/ directory."""
import urllib.request as ur
import csv
import sys
import time
import os
import unicodedata
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'
OUT_DIR = 'espncricinfo-fc'

# Create the output directory up front so open() below cannot raise
# FileNotFoundError on a missing directory.
os.makedirs(OUT_DIR, exist_ok=True)

# Walk every results page of the "first class" search query.
for i in range(0, 6019):
    url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=first%20class;all=1;page='
    # Name the parser explicitly so bs4 does not emit a "no parser" warning
    # and the behavior is the same on every machine.
    soupy = BeautifulSoup(ur.urlopen(url + str(i)).read(), 'html.parser')
    time.sleep(1)  # throttle requests so the server is not hammered
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except KeyError:  # anchor without an href attribute -- skip it
            continue
        odiurl = BASE_URL + urlparse(new_host).geturl()
        # Normalize, strip non-ASCII, then decode straight back to str so the
        # rest of the loop works with text (bytes has no str.split).
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore').decode('utf-8')
        print(new_host)
        # Fifth path component is the match identifier used as the filename.
        match_id = new_host.split('/')[4]
        print(match_id)
        html = urlopen(odiurl).read()
        if html:
            with open(os.path.join(OUT_DIR, match_id), 'wb') as f:
                f.write(html)