Search code examples
arraysbeautifulsouppython-requestsopenpyxlpandas.excelwriter

BeautifulSoup, Requests, Dataframe Saving to Excel arrays error


I am a novice at Python and helping out on a school project. Any help is much appreciated. THANKS. I get an error when it gets to the year 2004 and 2003. And it is caused by the result_list list. The error is "ValueError: arrays must all be same length". How can I introduce code that fixes this. The scores are important....

import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
#from openpyxl.writer.excel import ExcelWriter
import openpyxl
#from openpyxl import load_workbook
import csv

year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']

i=0
while i <= len(year_id)-1: 
    url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + str(year_id[i])
    lehigh = requests.get(url).text
    soup = BeautifulSoup(lehigh,'lxml')

    date_list = []
    for date in soup.find_all('div',class_="sidearm-schedule-game-opponent-date"):
        date_list.append(date.get_text(strip=True, separator=' '))

    name_list = []
    for name in soup.find_all('div',class_="sidearm-schedule-game-opponent-name"):
        name_list.append(name.get_text(strip=True, separator=' '))

    result_list = []
    for result in soup.find_all('div',class_="sidearm-schedule-game-result"):
        result_list.append(result.get_text(strip=True, separator=' '))

    opp_list = []
    for opp in soup.find_all('div',class_="sidearm-schedule-game-opponent-text"):
        opp_list.append(opp.get_text(strip=True, separator=' '))

    conf_list = []
    for conf in soup.find_all('div',class_="sidearm-schedule-game-conference-conference"):
        conf_list.append(conf.get_text(strip=True))

    dict = {'date':date_list,'opponent':name_list,'result':result_list,'list':opp_list,'conference':conf_list}
    df = pd.DataFrame(dict)

    workbook1 = openpyxl.load_workbook('lehigh.xlsx')
    writer = pd.ExcelWriter('lehigh.xlsx', engine='openpyxl') 
    writer.book = workbook1
    df.to_excel(writer, sheet_name=str(year_id[i]),index=False,startrow=0,startcol=0)
    writer.save()
    writer.close()

    i = i+1

Solution

  • Code is updated:

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from itertools import zip_longest
    
    d = []
    n = []
    res = []
    op = []
    yr = []
    with requests.Session() as req:
        for year in range(2003, 2020):
            print(f"Extracting Year# {year}")
            r = req.get(
                f"https://lehighsports.com/sports/mens-soccer/schedule/{year}")
            if r.status_code == 200:
                soup = BeautifulSoup(r.text, 'html.parser')
                for date in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-date flex-item-1'}):
                    d.append(date.get_text(strip=True, separator=" "))
                for name in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-name'}):
                    n.append(name.get_text(strip=True))
                for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
                    result = result.get_text(strip=True)
                    res.append(result)
                if len(d) != len(res):
                    res.append("None")
                for opp in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-text'}):
                    op.append(opp.get_text(strip=True, separator=' '))
                    yr.append(year)
    
    
    data = []
    for items in zip_longest(yr, d, n, res, op):
        data.append(items)
    
    df = pd.DataFrame(data, columns=['Year', 'Date', 'Name', 'Result', 'Opponent']).to_excel(
        'lehigh.xlsx', index=False)
    

    Output: check-online