I am trying to scrape data from ESPN Cricinfo using a Python script available on GitHub. The code is the following.
# Scrape first-class match pages from ESPN Cricinfo search results and save
# each match page into the espncricinfo-fc/ directory.
import urllib.request as ur
import csv
import sys
import time
import os
import unicodedata
from urllib.parse import urlparse
from bs4 import BeautifulSoup
BASE_URL = 'http://www.espncricinfo.com'
# Walk every results page of the "first class" search query.
for i in range(0, 6019):
url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=first%20class;all=1;page='
soupy = BeautifulSoup(ur.urlopen(url + str(i)).read())
# Throttle requests so the server is not hammered.
time.sleep(1)
# Each matching anchor links to one match page.
for new_host in soupy.findAll('a', {'class' : 'srchPlyrNmTxt'}):
try:
new_host = new_host['href']
except:
continue
odiurl = BASE_URL + urlparse(new_host).geturl()
# NOTE(review): .encode() turns new_host into a bytes object here.
new_host = unicodedata.normalize('NFKD', new_host).encode('ascii','ignore')
print (new_host)
# BUG: str.split requires a str but new_host is bytes -- this raises the
# TypeError quoted below. Also the [4] subscript is applied to print()'s
# return value (None), not to the split list.
print (str.split(new_host, "/"))[4]
# BUG: urllib2 is never imported (NameError in Python 3); ur.urlopen was
# presumably intended.
html = urllib2.urlopen(odiurl).read()
if html:
with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
f.write(html)
And the error is in this line.
print (str.split(new_host, "/"))[4]
TypeError: descriptor 'split' requires a 'str' object but received a 'bytes'. Any help from you would be appreciated. Thanks!
Use
str.split(new_host.decode("utf-8"), "/")[4]
.decode("utf-8")
obviously being the most important part. That turns your byte
object to a string.
On another note, be aware that urllib2 (which you're using but not importing, by the way) no longer exists in Python 3 — it was merged into urllib.request. Instead, you could use from urllib.request import urlopen.
EDIT: This is the full code that won't give you the error you described in your question. I am highlighting that because without the file previously created, the with open(...)
statement will give you a FileNotFoundError
.
"""Scrape first-class match pages from ESPN Cricinfo search results and save
each match page under the espncricinfo-fc/ directory."""
import urllib.request as ur
import csv
import sys
import time
import os
import unicodedata
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'
OUT_DIR = 'espncricinfo-fc'

# Create the output directory up front so open() below cannot raise
# FileNotFoundError on a missing directory.
os.makedirs(OUT_DIR, exist_ok=True)

# Walk every results page of the "first class" search query.
for i in range(0, 6019):
    url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=first%20class;all=1;page='
    # Name the parser explicitly so bs4 does not emit a "no parser" warning
    # and the behavior is the same on every machine.
    soupy = BeautifulSoup(ur.urlopen(url + str(i)).read(), 'html.parser')
    time.sleep(1)  # throttle requests so the server is not hammered
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except KeyError:  # anchor without an href attribute -- skip it
            continue
        odiurl = BASE_URL + urlparse(new_host).geturl()
        # Normalize, strip non-ASCII, then decode straight back to str so the
        # rest of the loop works with text (bytes has no str.split).
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore').decode('utf-8')
        print(new_host)
        # Fifth path component is the match identifier used as the filename.
        match_id = new_host.split('/')[4]
        print(match_id)
        html = urlopen(odiurl).read()
        if html:
            with open(os.path.join(OUT_DIR, match_id), 'wb') as f:
                f.write(html)