Tags: python, pandas, dataframe, beautifulsoup, xlsx

Appending results with Pandas and BeautifulSoup


PROBLEM: I have a list of sites from which I want BeautifulSoup and Pandas to grab a data table. I want to append the results of every iteration to the same xlsx or csv file.

My current code below iterates over each of the 3 sites, but the final product is only the last page scraped. If I remove my export function and just print df, I can see all 3 pages of data, so I'm not sure how to correctly append each iteration to my output file.

    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    from time import gmtime, strftime

    #Pass in the URL
    url = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]

    for site in url:
        #Load the page html
        page = requests.get(site)
        soup = BeautifulSoup(page.text, 'lxml')
    
        # Get all the table data
        table = soup.find('table', {'summary':'Standings - Detailed View'})
    
        headers = []
    
        for i in table.find_all('th'):
            title = i.text.strip()
            headers.append(title)
    
        #Dataframe the headers into columns
        df = pd.DataFrame(columns = headers)
    
        # TR for the rows, TD for the values
        for row in table.find_all('tr')[1:]:
            data = row.find_all('td')
            row_data = [td.text.strip() for td in data]
            length = len(df)
            df.loc[length] = row_data
        #Write the collected data out to an Excel file
        dateTime = strftime("%d%b%Y_%H%M", gmtime())
        writer = pd.ExcelWriter(dateTime + "Z" + ".xlsx")
        df.to_excel(writer)
        writer.save()
        print('[*] Data successfully written to Excel File.')

Solution

  • Try the following. You need to capture the dataframe from each url in a list, then concatenate them, then write the combined dataframe to Excel. This should work, but it is untested. See the comments inline; a further refinement is sketched after the code.

    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    from time import gmtime, strftime
    
    #Pass in the URL
    url = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]
    df_hold_list = [] #collect each dataframe separately
    
    for site in url:
        #Load the page html
        page = requests.get(site)
        soup = BeautifulSoup(page.text, 'lxml')
    
        # Get all the table data
        table = soup.find('table', {'summary':'Standings - Detailed View'})
    
        headers = []
    
        for i in table.find_all('th'):
            title = i.text.strip()
            headers.append(title)
    
        #Dataframe the headers into columns
        df = pd.DataFrame(columns = headers)
    
        # TR for the rows, TD for the values
        for row in table.find_all('tr')[1:]:
            data = row.find_all('td')
            row_data = [td.text.strip() for td in data]
            length = len(df)
            df.loc[length] = row_data
        
        df_hold_list.append(df) # add the dfs to the list
        
    final_df = pd.concat(df_hold_list, ignore_index=True) # stack the tables vertically (axis=0, the default): they all share the same columns
        
    # move this out of loop    
    #Write the collected data out to an Excel file
    dateTime = strftime("%d%b%Y_%H%M", gmtime())
    # use a context manager so the file is saved on exit
    # (ExcelWriter.save() was removed in pandas 2.0)
    with pd.ExcelWriter(dateTime + "Z" + ".xlsx") as writer:
        final_df.to_excel(writer) # write final_df to excel
    print('[*] Data successfully written to Excel File.')
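
One refinement worth considering: after the concat, nothing in the stacked rows records which season each table came from. Below is a minimal sketch of how you might tag each dataframe with its year before concatenating. The Season column name and the URL parsing are my own assumptions (the year is taken from the URL path), and the header/row handling mirrors the loops above, so every row is assumed to have one cell per header.

    from bs4 import BeautifulSoup
    import requests
    import pandas as pd

    urls = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]

    df_hold_list = []
    for site in urls:
        soup = BeautifulSoup(requests.get(site).text, 'lxml')
        table = soup.find('table', {'summary': 'Standings - Detailed View'})
        headers = [th.text.strip() for th in table.find_all('th')]
        rows = [[td.text.strip() for td in tr.find_all('td')]
                for tr in table.find_all('tr')[1:]]
        df = pd.DataFrame(rows, columns=headers)
        # Tag every row with the season parsed from the URL
        # (".../league/2021/reg" -> "2021"); "Season" is a made-up column name.
        df['Season'] = site.rstrip('/').split('/')[-2]
        df_hold_list.append(df)

    final_df = pd.concat(df_hold_list, ignore_index=True)

With the tag in place, ignore_index=True still gives one clean running index across all three seasons, and the combined table can be filtered by year later. If you don't need the fine-grained BeautifulSoup control, pandas' read_html (e.g. pd.read_html(site, attrs={'summary': 'Standings - Detailed View'})) may be able to replace the manual header/row loops entirely.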