Search code examples
pythonhtmlweb-scrapingnested

Unable to web-scrape nested HTML tags


I am trying to webscrape the particular text in the following but I keep getting errors. I am also unable to print out to CSV file.

My goal is to webscrape the Athletes' nicknames, full names, weight, and record for every athlete based on the HTML tags. Here is my code:

from bs4 import BeautifulSoup
import requests
from csv import writer 
import json

headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/104.0.0.0 Safari/537.36'
}
url="https://www.ufc.com/athletes/all"
page = requests.get(url,headers=headers)




soup = BeautifulSoup(page.content, 'html.parser')


lists = soup.find_all('div', class_="c-listing-athlete__text")

with open(r"UFCstats.csv", 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Nickname', 'Fullname', 'Weight', 'Record']
thewriter.writerow(header)

for list in lists:
    nickname = list.find('span', class_="c-listing-athlete__nickname").find('div', class_="field__item").text
    
    """
    fullName = list.find('span', class_="c-listing-athlete__name")
    weight = list.find('div', class_="field__item").text
    record = list.find('div.span.span', class_="c-listing-athlete__record")
    info =[nickname, fullName, weight, record]
    """
    info =[nickname]
    print(info)
    thewriter.writerow(info)

Error:

AttributeError: 'NoneType' object has no attribute 'text'

The URL page source code:

Source Code of URL Please assist thanks

EDIT Thanks fellow Stackoverflow scholar Abdullah Mughal, I was able to webscrap the UFC Athletes information onto CSV file. However, my next goal is to stop the webscrap when the variable fullname is empty. However, it keeps webscrapping even though I have set the len(info) !=0 such that info=[nickname, fullname, weight, record].

The webscrap does not stop. I even tried to put a while loop such that the it continues to loop until it encounters and empty class.

Here is my new improved code:

from cmath import inf
from bs4 import BeautifulSoup
import requests
from csv import writer
import pandas as pd
import numpy as np 
from time import sleep 
from random import randint



headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)  Chrome / 104.0.0.0 Safari / 537.36'
}

url = "https://www.ufc.com/athletes/all?gender=All&search=&page="


not_empty = True
endpage = 500
while not_empty:
    pages = np.arange(1,endpage,1)
    for page in pages:

        url1 = url + str(page)
        page = requests.get(url1)


        soup = BeautifulSoup(page.content, 'html.parser')
        '''sleep(randint(2,8))'''

        lists = soup.find_all('div', class_="c-listing-athlete__text")

        if lists is None:
            not_empty=False
            
        else:
            f = open(r"UFCstats.csv", 'a', encoding='utf8', newline='')
            csv_writer = writer(f)

            header = ['Nickname', 'Fullname', 'Weight', 'Record']
            csv_writer.writerow(header)



            for athlete_card in lists:
                nickname = ""
                fullname = ""
                weight = ""
                record = ""

                athlete_name = athlete_card.find('span', class_="c-listing-athlete__nickname")
                if athlete_name is not None:
                    nickname = athlete_name.text.strip()
                athlete_fullname = athlete_card.find('span', class_="c-listing-athlete__name")
                if athlete_fullname is not None:
                    fullname = athlete_fullname.text.strip()
                athlete_weight = athlete_card.find('span', class_="c-listing-athlete__title")
                if athlete_weight is not None:
                    weight = athlete_weight.text.strip()
                athlete_record = athlete_card.find('span', class_="c-listing-athlete__record")
                if athlete_record is not None:
                    record = athlete_record.text.strip()
                
                info = [nickname, fullname, weight, record]

                if len(info) !=0:
                    print([nickname, fullname, weight, record])

                    csv_writer.writerow(info)
               
            not_empty=True

             
            
           
    
    print(not_empty)

    
f.close()

Please assist

Thanks so much


Solution

  • In some cases, soup is not able to find text against objects, that's why the exception is observed. Try following code, if it resolves your issue

    from bs4 import BeautifulSoup
    import requests
    from csv import writer
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)  Chrome / 104.0.0.0 Safari / 537.36'
    }
    url = "https://www.ufc.com/athletes/all"
    page = requests.get(url, headers=headers)
    
    soup = BeautifulSoup(page.content, 'html.parser')
    
    lists = soup.find_all('div', class_="c-listing-athlete__text")
    f = open(r"UFCstats.csv", 'w', encoding='utf8', newline='')
    csv_writer = writer(f)
    
    header = ['Nickname', 'Fullname', 'Weight', 'Record']
    csv_writer.writerow(header)
    
    for athlete_card in lists:
        nickname = ""
        fullname = ""
        weight = ""
        record = ""
    
        athlete_name = athlete_card.find('span', class_="c-listing-athlete__nickname")
        if athlete_name is not None:
            nickname = athlete_name.text.strip()
        athlete_fullname = athlete_card.find('span', class_="c-listing-athlete__name")
        if athlete_fullname is not None:
            fullname = athlete_fullname.text.strip()
        athlete_weight = athlete_card.find('span', class_="c-listing-athlete__title")
        if athlete_weight is not None:
            weight = athlete_weight.text.strip()
        athlete_record = athlete_card.find('span', class_="c-listing-athlete__record")
        if athlete_record is not None:
            record = athlete_record.text.strip()
        print([nickname, fullname, weight, record])
        info = [nickname, fullname, weight, record]
    
        csv_writer.writerow(info)
    f.close()
    

    EDIT 1

    Problem in your code is that you have defined a for loop within while loop, it will be executed 500 times irrespective of whether you are setting the value of not_empty true or false, you need to include break if certain conditions met to stop for loop, so your variable not_empty get validation in while loop.


    Anyhow, I have made a few changes in your code, and based on your request it will get terminated when lists are empty, I have added the start_page variable to 247 (to check) you can reset it to 1.

    from cmath import inf
    from bs4 import BeautifulSoup
    import requests
    from csv import writer
    # import pandas as pd
    import numpy as np
    from time import sleep
    from random import randint
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)  Chrome / 104.0.0.0 Safari / 537.36'
    }
    
    url = "https://www.ufc.com/athletes/all?gender=All&search=&page="
    
    not_empty = True
    end_page = 500
    start_page = 247
    while not_empty:
        pages = np.arange(start_page, end_page, 1)
        for page in pages:
    
            url1 = url + str(page)
            page = requests.get(url1)
    
            soup = BeautifulSoup(page.content, 'html.parser')
            '''sleep(randint(2,8))'''
    
            lists = soup.find_all('div', class_="c-listing-athlete__text")
            if len(lists) > 0:
                f = open(r"UFCstats.csv", 'a', encoding='utf8', newline='')
                csv_writer = writer(f)
    
                header = ['Nickname', 'Fullname', 'Weight', 'Record']
                csv_writer.writerow(header)
    
                for athlete_card in lists:
                    nickname = ""
                    fullname = ""
                    weight = ""
                    record = ""
    
                    athlete_name = athlete_card.find('span', class_="c-listing-athlete__nickname")
                    if athlete_name is not None:
                        nickname = athlete_name.text.strip()
                    athlete_fullname = athlete_card.find('span', class_="c-listing-athlete__name")
                    if athlete_fullname is not None:
                        fullname = athlete_fullname.text.strip()
                    athlete_weight = athlete_card.find('span', class_="c-listing-athlete__title")
                    if athlete_weight is not None:
                        weight = athlete_weight.text.strip()
                    athlete_record = athlete_card.find('span', class_="c-listing-athlete__record")
                    if athlete_record is not None:
                        record = athlete_record.text.strip()
    
                    info = [nickname, fullname, weight, record]
    
                    if len(info) != 0:
                        print([nickname, fullname, weight, record])
    
                        csv_writer.writerow(info)
                f.close()
            else:
                not_empty = False
                break
    
        print(not_empty)