I am trying to scrape a particular piece of text from the page below, but I keep getting errors, and I am also unable to write the output to a CSV file.
My goal is to scrape each athlete's nickname, full name, weight, and record, based on the HTML tags. Here is my code:
from bs4 import BeautifulSoup
import requests
from csv import writer
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
url = "https://www.ufc.com/athletes/all"
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_="c-listing-athlete__text")

with open(r"UFCstats.csv", 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Nickname', 'Fullname', 'Weight', 'Record']
    thewriter.writerow(header)
    for list in lists:
        nickname = list.find('span', class_="c-listing-athlete__nickname").find('div', class_="field__item").text
        """
        fullName = list.find('span', class_="c-listing-athlete__name")
        weight = list.find('div', class_="field__item").text
        record = list.find('div.span.span', class_="c-listing-athlete__record")
        info = [nickname, fullName, weight, record]
        """
        info = [nickname]
        print(info)
        thewriter.writerow(info)
Error:
AttributeError: 'NoneType' object has no attribute 'text'
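For context, here is a minimal example with made-up HTML that reproduces the same error: find() returns None when nothing matches, and calling .text on that None is what raises the AttributeError.

from bs4 import BeautifulSoup

html = '<span class="c-listing-athlete__nickname">"Example"</span>'
soup = BeautifulSoup(html, 'html.parser')
span = soup.find('span', class_="c-listing-athlete__nickname")

print(span.find('div', class_="field__item"))        # None: the span has no such child
span.find('div', class_="field__item").text          # AttributeError: 'NoneType' object has no attribute 'text'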
Please assist, thanks.
EDIT: Thanks to fellow Stack Overflow scholar Abdullah Mughal, I was able to scrape the UFC athletes' information into a CSV file. However, my next goal is to stop the scrape when the variable fullname is empty, and it keeps scraping even though I have set len(info) != 0, where info = [nickname, fullname, weight, record].
The scrape does not stop. I even tried a while loop so that it keeps looping until it encounters an empty class.
Here is my new, improved code:
from cmath import inf
from bs4 import BeautifulSoup
import requests
from csv import writer
import pandas as pd
import numpy as np
from time import sleep
from random import randint

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome / 104.0.0.0 Safari / 537.36'
}
url = "https://www.ufc.com/athletes/all?gender=All&search=&page="
not_empty = True
endpage = 500
while not_empty:
    pages = np.arange(1, endpage, 1)
    for page in pages:
        url1 = url + str(page)
        page = requests.get(url1)
        soup = BeautifulSoup(page.content, 'html.parser')
        '''sleep(randint(2,8))'''
        lists = soup.find_all('div', class_="c-listing-athlete__text")
        if lists is None:
            not_empty = False
        else:
            f = open(r"UFCstats.csv", 'a', encoding='utf8', newline='')
            csv_writer = writer(f)
            header = ['Nickname', 'Fullname', 'Weight', 'Record']
            csv_writer.writerow(header)
            for athlete_card in lists:
                nickname = ""
                fullname = ""
                weight = ""
                record = ""
                athlete_name = athlete_card.find('span', class_="c-listing-athlete__nickname")
                if athlete_name is not None:
                    nickname = athlete_name.text.strip()
                athlete_fullname = athlete_card.find('span', class_="c-listing-athlete__name")
                if athlete_fullname is not None:
                    fullname = athlete_fullname.text.strip()
                athlete_weight = athlete_card.find('span', class_="c-listing-athlete__title")
                if athlete_weight is not None:
                    weight = athlete_weight.text.strip()
                athlete_record = athlete_card.find('span', class_="c-listing-athlete__record")
                if athlete_record is not None:
                    record = athlete_record.text.strip()
                info = [nickname, fullname, weight, record]
                if len(info) != 0:
                    print([nickname, fullname, weight, record])
                    csv_writer.writerow(info)
                    not_empty = True
                    print(not_empty)
            f.close()
Please assist
Thanks so much
In some cases, soup is not able to find the element at all; find() then returns None, and reading .text on that None raises the exception you are seeing. Try the following code and see if it resolves your issue:
from bs4 import BeautifulSoup
import requests
from csv import writer

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
url = "https://www.ufc.com/athletes/all"
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_="c-listing-athlete__text")

f = open(r"UFCstats.csv", 'w', encoding='utf8', newline='')
csv_writer = writer(f)
header = ['Nickname', 'Fullname', 'Weight', 'Record']
csv_writer.writerow(header)
for athlete_card in lists:
    # default to empty strings so a missing field never crashes the row
    nickname = ""
    fullname = ""
    weight = ""
    record = ""
    athlete_name = athlete_card.find('span', class_="c-listing-athlete__nickname")
    if athlete_name is not None:
        nickname = athlete_name.text.strip()
    athlete_fullname = athlete_card.find('span', class_="c-listing-athlete__name")
    if athlete_fullname is not None:
        fullname = athlete_fullname.text.strip()
    athlete_weight = athlete_card.find('span', class_="c-listing-athlete__title")
    if athlete_weight is not None:
        weight = athlete_weight.text.strip()
    athlete_record = athlete_card.find('span', class_="c-listing-athlete__record")
    if athlete_record is not None:
        record = athlete_record.text.strip()
    print([nickname, fullname, weight, record])
    info = [nickname, fullname, weight, record]
    csv_writer.writerow(info)
f.close()
EDIT 1
The problem in your code is that you have defined a for loop within the while loop; the for loop runs through all 500 pages regardless of whether you set not_empty to True or False. You need a break once the stop condition is met, so that the while loop actually gets to re-check not_empty. Note also that soup.find_all() returns an empty list, never None, so the lists is None check can never fire, and info = [nickname, fullname, weight, record] always contains four strings, so len(info) != 0 is always true.
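In skeleton form, the control flow you want looks like this (a runnable sketch, with the real page scrape replaced by a placeholder):

not_empty = True
page = 1
while not_empty:
    # placeholder for the real request/parse of the current page
    lists = []                # pretend find_all() came back empty on this page
    if not lists:             # an empty list is falsy; find_all() never returns None
        not_empty = False     # flag the outer loop...
        break                 # ...and break, so the while condition is re-checked
    page += 1
print(not_empty)              # prints False once the loop has terminated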
Anyhow, I have made a few changes to your code; based on your request, it will terminate when lists comes back empty. I have set the start_page variable to 247 (to check quickly); you can reset it to 1.
from bs4 import BeautifulSoup
import requests
from csv import writer
import numpy as np
from time import sleep
from random import randint

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
url = "https://www.ufc.com/athletes/all?gender=All&search=&page="
not_empty = True
end_page = 500
start_page = 247

# open the CSV once and write the header a single time,
# rather than appending a fresh header row for every page
f = open(r"UFCstats.csv", 'w', encoding='utf8', newline='')
csv_writer = writer(f)
csv_writer.writerow(['Nickname', 'Fullname', 'Weight', 'Record'])

while not_empty:
    pages = np.arange(start_page, end_page, 1)
    for page in pages:
        url1 = url + str(page)
        response = requests.get(url1, headers=headers)  # pass the headers defined above
        soup = BeautifulSoup(response.content, 'html.parser')
        # sleep(randint(2, 8))  # optional: pause between requests
        lists = soup.find_all('div', class_="c-listing-athlete__text")
        if len(lists) > 0:
            for athlete_card in lists:
                nickname = ""
                fullname = ""
                weight = ""
                record = ""
                athlete_name = athlete_card.find('span', class_="c-listing-athlete__nickname")
                if athlete_name is not None:
                    nickname = athlete_name.text.strip()
                athlete_fullname = athlete_card.find('span', class_="c-listing-athlete__name")
                if athlete_fullname is not None:
                    fullname = athlete_fullname.text.strip()
                athlete_weight = athlete_card.find('span', class_="c-listing-athlete__title")
                if athlete_weight is not None:
                    weight = athlete_weight.text.strip()
                athlete_record = athlete_card.find('span', class_="c-listing-athlete__record")
                if athlete_record is not None:
                    record = athlete_record.text.strip()
                info = [nickname, fullname, weight, record]
                print(info)
                csv_writer.writerow(info)
        else:
            not_empty = False
            break  # stop the for loop so the while condition is re-checked
    print(not_empty)
f.close()
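And if you specifically want to stop as soon as a card comes back with an empty fullname (your original stop condition), test the string itself rather than len(info). A standalone sketch with made-up data; in the full scraper you would set not_empty to False in the same way and then also break out of the page loop:

cards = [("Athlete One", "1-0-0"), ("Athlete Two", "2-0-0"), ("", "")]  # made-up data
not_empty = True
for fullname, record in cards:
    if not fullname:          # an empty string is falsy: treat it as the end of the listing
        not_empty = False
        break                 # leave the athlete loop; the page loop should break next
    print(fullname, record)
print(not_empty)              # False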