I am trying to scrape data from the first table in the Educational Attainment section of the Statistical Atlas website, which is based on census data. Essentially, I want to scrape the percentages from the table and add them to a data frame that has zip codes in the leftmost column and separate columns for HS, no HS, and higher degree. I am trying to do this for all of the zip codes in New York City.
This is the code I have come up with so far. Can you help me refine it so that I can cycle through all of the zip codes in New York City and get a data frame with one column for each educational category from the first table?
Here is the link to statistical atlas: https://statisticalatlas.com/place/New-York/New-York/Overview
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import getpass
import os
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
# Workbook and sheet that hold the target zip codes.
# NOTE(review): this path was broken across two lines in the original; it is
# rejoined here with a space ("Free Lance") -- confirm against the real folder name.
file_name = ('C:/Users/Nicholas_G/Desktop/Google Drive/Work/Free Lance/'
             'Political Targeting/Census Data.xlsx')
sheet_name = 'NYC Zip Target'
Census_Data = pd.read_excel(file_name, sheet_name=sheet_name)
zip_list = list(Census_Data['RESIDENTIAL_ZIP'])

# The overview page is fetched and parsed here, but nothing below reads `soup`.
url = "https://statisticalatlas.com/place/New-York/New-York/Overview"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

# One entry per zip code, in the same order as zip_list.
l = []
for a in zip_list:
    r = requests.get(f"https://statisticalatlas.com/zip/{a}/Educational-Attainment")
    s = BeautifulSoup(r.text, 'lxml')
    data = s.find('svg', {'viewBox': '0 0 400 79'})
    # find() returns None when no element matches; store None for that zip
    # instead of crashing on data.find, so l stays aligned with zip_list.
    value = data.find('svg', {'fill': '#000'}) if data is not None else None
    l.append(value)
I am not that familiar with multiprocessing; otherwise I would go down that route. Here is my version using a Session:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# New York overview page; passed to getCodes() to collect the zip-code links.
urlMain = 'https://statisticalatlas.com/place/New-York/New-York/Overview'
# Per-zip page template; the {} placeholder is filled with a zip code via str.format().
urlAttainment = 'https://statisticalatlas.com/zip/{}/Educational-Attainment'
def getPercentages(url, session=None):
    """Return three percentage strings scraped from an Educational-Attainment page.

    Args:
        url: Full URL of the page to fetch.
        session: Optional ``requests.Session`` used for the request so the
            connection can be reused; when omitted, ``requests.get`` is used.

    Returns:
        The text of the 1st, 3rd and 5th ``<title>`` elements found under
        ``rect`` elements of the ``figure/educational-attainment`` element
        (the caller labels them HD, HS, NoHS), or an empty list when the
        response status is not 200.

    Raises:
        IndexError: If the page has fewer than five matching titles.
    """
    http = session if session is not None else requests
    res = http.get(url)
    if res.status_code != 200:
        print(res.status_code, url)
        return []
    soup = BeautifulSoup(res.content, "lxml")
    titles = soup.select('[id="figure/educational-attainment"] rect title')
    return [titles[0].text, titles[2].text, titles[4].text]


def getCodes(url, session=None):
    """Return the link text of every zip-code link on the page at *url*.

    Args:
        url: Full URL of the page that lists the zip codes.
        session: Optional ``requests.Session`` used for the request; when
            omitted, ``requests.get`` is used.

    Returns:
        A list with the text of each ``<a>`` inside ``.info-table-contents-div``
        whose ``href`` contains ``zip``.
    """
    http = session if session is not None else requests
    res = http.get(url)
    soup = BeautifulSoup(res.content, "lxml")
    return [link.text for link in soup.select('.info-table-contents-div a[href*=zip]')]


results = []
# Pass the session to the helpers so every request actually goes through it.
with requests.Session() as s:
    zipcodes = getCodes(urlMain, s)
    for zipcode in zipcodes:
        pageUrl = urlAttainment.format(zipcode)
        try:
            row = getPercentages(pageUrl, s)
        except IndexError as ex:
            # Page did not contain the expected five titles; report and skip it.
            print(ex, pageUrl)
            continue
        if not row:
            # Non-200 response (already printed by getPercentages): keep the zip
            # code but mark its values as missing, so every row has four fields
            # and the DataFrame construction below cannot fail on row width.
            row = [None, None, None]
        results.append([zipcode, *row])

df = pd.DataFrame(results, columns=['zipcode', 'HD', 'HS', 'NoHS'])
print(df)