Tags: python, pandas, web-scraping, beautifulsoup, population

Efficiently Scrape a Website by Looping Through URLs


I am trying to scrape population data for each US state for a range of years. Here is my code so far:

#Import libraries
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd

#Range of years
years = np.arange(1991,2019).tolist()

#Quick check that the page responds (expect <Response [200]>; note this does not verify that scraping is allowed)
url = 'https://fred.stlouisfed.org/release/tables?rid=118&eid=259194'
page = requests.get(url)
page

#Set up dataframe
df = pd.DataFrame(
    {
        'State':[''], 
        'Year':[''],
        'Population':['']
    }
)


#For each year - run through this process
for i in years:
    full_link = url+'&od='+str(i)+'-01-01#'
    page = requests.get(full_link)
    soup = BeautifulSoup(page.text,'lxml')
    
    
    
    #Grab the state names
    states = soup.find_all('span',class_='fred-rls-elm-nm')
    states_list = []
    for i in states:
        state_name = i.text
        states_list.append(state_name)

    #Package into df
    states_df = pd.DataFrame(states_list,columns=['State'])
    states_df['index'] = np.arange(len(states_df))
    
    
    #Grab the population values
    pops = soup.find_all('td',class_='fred-rls-elm-vl-td')
    pop_list = []
    for i in pops:
        pop_num = i.text
        pop_list.append(pop_num)


    #Package into a df
    pop_df = pd.DataFrame(pop_list,columns=['Pop'])
    
    #Clean up population values
    pop_df['Pop'] = pop_df['Pop'].replace('\n',"",regex=True).str.strip()
    pop_df['Pop'] = pop_df['Pop'].replace(',',"",regex=True)
    pop_df = pop_df.apply(pd.to_numeric,errors='ignore')
    #Grab every third value
    pop_df = pop_df.iloc[::3, :]
    pop_df['Pop'] = (pop_df['Pop']*1000).astype(int)
    
    #Create index for joining
    pop_df['index'] = np.arange(len(pop_df))
    
    #Combine dataframes
    full_df = pd.merge(
        states_df,
        pop_df,
        how='left',
        on=['index']
    )
    
    #Add in the year
    full_df = full_df.append({
        'Year' : i
    },ignore_index=True)
    
    
    #Collect all dataframes in a list
    df_lists=[]
    for i in df_lists:
        df_lists.append(full_df)
    
#Print the list of dataframes
df_lists
    

I have been able to get this script to work when I hard-code the year into the URL, but that is really inefficient. I want to loop through the years in the URL, grab each year's population data, and package everything into a single dataframe. So I would expect the dataframe to look like this:

State          Year  Population
Alabama        1991  4567
Alaska         1991  4563
Arizona        1991  4532
.........................
West Virginia  2018  3454
Wisconsin      2018  3423
Wyoming        2018  9843
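
For reference, the per-year URLs I am building look like this (base and year_urls are just illustrative names):

#Build the per-year URLs (same pattern as full_link above)
base = 'https://fred.stlouisfed.org/release/tables?rid=118&eid=259194'
year_urls = [base + '&od=' + str(year) + '-01-01#' for year in range(1991, 2019)]
print(year_urls[0])   #...&od=1991-01-01#
print(year_urls[-1])  #...&od=2018-01-01#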

Can someone help point me in the right direction? I feel like I'm close, but I'm missing one or two details somewhere.

Any help would be appreciated! Thank you!


Solution

  • I figured it out: mostly I needed to differentiate the loop variables. The inner loops were reusing i, which clobbered the year from the outer loop, so they now use j and k instead.
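
    To see the bug in isolation: when an inner loop reuses the outer loop's variable, the year is overwritten before it can be recorded.

        #The inner loop rebinds i, so after it finishes, i holds a state name, not a year
        for i in [1991, 1992]:
            for i in ['Alabama', 'Alaska']:
                pass
            print(i)  #prints 'Alaska' both times, never the year

    With the loop variables separated, here is the full working version: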

    #(The empty placeholder dataframe from the question is not needed here;
    # the per-year frames are collected in a list and concatenated at the end.)
    
    list_of_dfs=[]
    
    #For each year - run through this process
    for i in years:
        full_link = url+'&od='+str(i)+'-01-01#'
        page = requests.get(full_link)
        soup = BeautifulSoup(page.text,'lxml')
        
        
        #Grab the state names
        states = soup.find_all('span',class_='fred-rls-elm-nm')
        states_list = []
        for j in states:
            state_name = j.text
            states_list.append(state_name)
    
        #Package into df
        states_df = pd.DataFrame(states_list,columns=['State'])
        states_df['index'] = np.arange(len(states_df))
        
        
        #Grab the population values
        pops = soup.find_all('td',class_='fred-rls-elm-vl-td')
        pop_list = []
        for k in pops:
            pop_num = k.text
            pop_list.append(pop_num)
    
    
        #Package into a df
        pop_df = pd.DataFrame(pop_list,columns=['Pop'])
        
        #Clean up population values: strip newlines/whitespace and thousands separators
        pop_df['Pop'] = pop_df['Pop'].str.replace('\n', '', regex=False).str.strip()
        pop_df['Pop'] = pop_df['Pop'].str.replace(',', '', regex=False)
        pop_df['Pop'] = pd.to_numeric(pop_df['Pop'], errors='coerce')
        #Each state row has three value cells; keep the first of each triple
        pop_df = pop_df.iloc[::3, :].copy()
        #Values are reported in thousands of persons
        pop_df['Pop'] = (pop_df['Pop']*1000).astype(int)
        
        #Create index for joining
        pop_df['index'] = np.arange(len(pop_df))
        
        #Combine dataframes
        full_df = pd.merge(
            states_df,
            pop_df,
            how='left',
            on=['index']
        )
        #Add in the year (kept as an integer to match the desired output)
        full_df['Year'] = i
        
        
        list_of_dfs.append(full_df)
        
    #Stack the per-year frames into a single dataframe
    full_df = pd.concat(list_of_dfs, ignore_index=True)
    full_df
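
    One more optional cleanup (not strictly part of the fix): the merged frame still carries the helper index column and names the values Pop, so a final pass makes it match the layout from the question.

    #Drop the join helper and rename/reorder to match the desired output
    full_df = full_df.drop(columns=['index']).rename(columns={'Pop': 'Population'})
    full_df = full_df[['State', 'Year', 'Population']]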