Search code examples
python · pandas · web-scraping · beautifulsoup · census

Web Scraping Census Data


I am trying to web scrape data from the first table in the educational attainment section of the statistical atlas website that is based on census data. Essentially, I want to web scrape the percentages from the table and then add those percentages to a data frame that has zip codes at the very left and have separate columns for HS, no HS, and higher degree. I am trying to do this for all of the zip codes in NY City.

This is the code I have come up with so far. Can you help me refine it so that I can cycle through all of the zip codes in New York City and get a data frame with one column for each educational category from the first table?

Here is the link to statistical atlas: https://statisticalatlas.com/place/New-York/New-York/Overview

import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import getpass
import os
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen

# Workbook holding the target zip codes (the path is specific to the author's machine).
file_name = 'C:/Users/Nicholas_G/Desktop/Google Drive/Work/Free Lance/Political Targeting/Census Data.xlsx'
sheet_name = 'NYC Zip Target'
Census_Data = pd.read_excel(file_name, sheet_name=sheet_name)

# One entry per row of the RESIDENTIAL_ZIP column.
zip_list = list(Census_Data['RESIDENTIAL_ZIP'])

# Fetch and parse the city overview page (`soup` is not used further in this snippet).
url = "https://statisticalatlas.com/place/New-York/New-York/Overview"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
l = []

# Visit each zip code's Educational-Attainment page and collect the matched element.
for a in zip_list:
    r = requests.get(f"https://statisticalatlas.com/zip/{a}/Educational-Attainment")
    s = BeautifulSoup(r.text, 'lxml')
    data = s.find('svg', {'viewBox': '0 0 400 79'})
    # find() returns None when nothing matches; store None for that zip code
    # instead of raising AttributeError, so `l` stays aligned with `zip_list`.
    value = data.find('svg', {'fill': '#000'}) if data is not None else None
    l.append(value)

Solution

  • I am not that familiar with multiprocessing; otherwise I would go down that route. Here is my version using Session:

    import requests
    import pandas as pd
    from bs4 import BeautifulSoup
    
    # New York City overview page; passed to getCodes() to collect the zip codes.
    urlMain = 'https://statisticalatlas.com/place/New-York/New-York/Overview'
    # Template for a zip code's Educational-Attainment page; the {} placeholder
    # is filled with the zip code via str.format().
    urlAttainment = 'https://statisticalatlas.com/zip/{}/Educational-Attainment'
    
    def getPercentages(url, session=None, timeout=30):
        """Scrape three educational-attainment figures from one zip-code page.

        Args:
            url: Address of an "Educational-Attainment" page.
            session: Optional ``requests.Session`` to send the request through,
                so the underlying connection is reused between calls. When
                omitted, the module-level ``requests.get`` is used.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A list holding the text of the 1st, 3rd and 5th ``<title>`` elements
            matched under the ``figure/educational-attainment`` element, or an
            empty list when the response status is not 200.

        Raises:
            IndexError: If fewer than five matching ``<title>`` elements exist.
            requests.RequestException: If the request fails or times out.
        """
        http = requests if session is None else session
        res = http.get(url, timeout=timeout)
        if res.status_code != 200:
            print(res.status_code, url)
            return []

        soup = BeautifulSoup(res.content, "lxml")
        titles = soup.select('[id="figure/educational-attainment"] rect title')
        # NOTE(review): only every other title is kept; presumably the skipped
        # ones belong to a second <rect> per category. Verify against the page.
        return [titles[i].text for i in (0, 2, 4)]

    def getCodes(url, session=None, timeout=30):
        """Collect the zip codes linked from an overview page.

        Args:
            url: Address of the overview page to read.
            session: Optional ``requests.Session`` to send the request through;
                when omitted, the module-level ``requests.get`` is used.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The link text of every ``<a>`` whose href contains "zip" inside an
            ``.info-table-contents-div`` element, or an empty list when the
            response status is not 200.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        http = requests if session is None else session
        res = http.get(url, timeout=timeout)
        if res.status_code != 200:
            # Report the failure the same way getPercentages does.
            print(res.status_code, url)
            return []

        soup = BeautifulSoup(res.content, "lxml")
        return [code.text for code in soup.select('.info-table-contents-div a[href*=zip]')]

    results = []

    # A single Session is shared by every request so connections to the site
    # are reused instead of being reopened for each zip code.
    with requests.Session() as s:
        zipcodes = getCodes(urlMain, session=s)

        for zipcode in zipcodes:
            pageUrl = urlAttainment.format(zipcode)
            try:
                percentages = getPercentages(pageUrl, session=s)
            except (IndexError, requests.RequestException) as ex:
                # Page layout differs or the request failed: report and move on
                # rather than losing the rows collected so far.
                print(ex, pageUrl)
                continue
            # A non-200 page yields no percentages; pad with None so every row
            # has four fields and the DataFrame constructor cannot fail on
            # a column-count mismatch.
            row = [zipcode] + (percentages or [None] * 3)
            results.append(row)
    df = pd.DataFrame(results, columns=['zipcode', 'HD', 'HS', 'NoHS'])
    print(df)