import requests
from bs4 import BeautifulSoup
# One hard-coded index URL per folder; only the trailing letter differs.
url = 'https://www.basketball-reference.com/players/a/'
urlb = 'https://www.basketball-reference.com/players/b/'
urlc = 'https://www.basketball-reference.com/players/c/'
# Fetch the 'a' page, parse it with the lxml parser, and collect the
# text nodes that match "Kentucky".
result = requests.get(url)
doc = BeautifulSoup(result.text, 'lxml')
college = doc.find_all(string="Kentucky")
# Same three steps for the 'b' page; `result` and `doc` are overwritten.
result = requests.get(urlb)
doc = BeautifulSoup(result.text, 'lxml')
collegeb = doc.find_all(string='Kentucky')
# Same three steps again for the 'c' page.
result = requests.get(urlc)
doc = BeautifulSoup(result.text, 'lxml')
collegec = doc.find_all(string='Kentucky')
# Print the matches found on each page, one list per page.
print(college)
print(collegeb)
print(collegec)
I need to do this for every letter of the alphabet, for at least 30 schools, and I would really like to know how to do it more efficiently.
Deduplicate nearly identical code with a loop over the inputs and a `list` or `dict` of results:
import requests
from bs4 import BeautifulSoup
# Fetch several player-index pages and collect the "Kentucky" text matches
# from each one, replacing three copy-pasted fetch/parse/search sequences.
url_template = 'https://www.basketball-reference.com/players/{}/'
folders = ['a', 'b', 'c']  # The only varying thing in your original tripled code
colleges = []  # Store the results for each varied thing here in same order
for folder in folders:  # Loop over varying component
    # Substitute the folder into the template and fetch that page
    result = requests.get(url_template.format(folder))
    doc = BeautifulSoup(result.text, 'lxml')
    colleges.append(doc.find_all(string="Kentucky"))  # Append result in same order
# Loop over results to print them (one list of matches per folder)
for college in colleges:
    print(college)
If you need this to work for many schools, for every letter of the alphabet, you'd likely use a `dict` (better, a `defaultdict`) for the results instead of a `list`, so you can group the results by school, with an inner loop parsing out the data for each school:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from string import ascii_lowercase
# Fetch every lettered player-index page once and, for each page, collect
# the text matches for every school of interest, grouped by school name.
url_template = 'https://www.basketball-reference.com/players/{}/'
folders = ascii_lowercase  # Will run for every lowercase alphabet letter
# Extend this tuple with the rest of the schools you care about
schoolnames = ("Kentucky", "Gonzaga")
colleges = defaultdict(list)  # Store a list of results for each school
for folder in folders:  # Loop over varying component
    # Substitute the folder into the template and fetch that page
    result = requests.get(url_template.format(folder))
    doc = BeautifulSoup(result.text, 'lxml')
    # Parse each page only once, then search it for every school
    for schoolname in schoolnames:
        # `string=` is find_all's text filter; the school name is the value
        colleges[schoolname].append(doc.find_all(string=schoolname))
# Loop over results to print them: school name, then one list per folder
for collegename, results in colleges.items():
    print(collegename)
    for result in results:
        print(result)