python · pandas · dataframe · web-scraping · data-science

How to scrape a table from a page and create a multi-column dataframe with Python?


The database at https://aviation-safety.net/wikibase/ covers the years 1902 to 2022. I am trying to scrape the table, narrative, probable cause, and classification for every accident in 2015 and 2016: https://aviation-safety.net/database/dblist.php?Year=2015. With the code below I am able to scrape only the table:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep

def scraping(year):


    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }

    url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    #sleep(randint(1,3))
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text,'html.parser')

    page_container = soup.find('div',{'class':'pagenumbers'})

    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
        

    #info = []
    tl = []
    for page in range(1,pages+1):

        new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
        print(new_url)
        
        #sleep(randint(1,3))
        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')


        table = soup.find('table')
   
    
        for index,row in enumerate(table.find_all('tr')):
            if index == 0:
                continue

            link_ = 'https://aviation-safety.net/'+row.find('a')['href']
            
            #sleep(randint(1,3))
            new_page = requests.get(link_, headers=headers)
            new_soup = BeautifulSoup(new_page.text, 'lxml')
            table1 = new_soup.find('table')
            
           
            for i in table1.find_all('tr'):
                title = i.text
                tl.append(title)
                
                
    df = pd.DataFrame(tl)
    df.columns = ['status']
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)
          

if __name__ == "__main__":

    START = 2015
    STOP = 2016

    years = [year for year in range(START,STOP+1)]

    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping,years)

But the data is not organized. The dataframe looks like this:

(screenshot: every scraped field stacked into a single column)

The outcome should be like this:

(screenshot: one row per accident, with each field in its own column)


Solution

  • It looks like the values of tl are strings, e.g. 'Status:Accident investigation report completed and information captured'.

    Converting the list of strings into a pd.DataFrame gets you a single column with all the values in the list.
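
    A quick demonstration of that behavior (toy strings here, not the real scraped values):

    >>> import pandas as pd
    >>> pd.DataFrame(['Status:completed', 'Date:7 March 2015']).shape
    (2, 1)   # two rows, one column: every string lands in the same column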

    If you want to use the "name" part of the string, e.g. Status, as a column header, you'll need to separate it from the rest of the text.

    # maxsplit of 1 so we don't accidentally split up the values, e.g. time
    title, text = title.split(":", maxsplit=1)
    

    This looks like

    ('Status', 'Accident investigation report completed and information captured')
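
    The maxsplit=1 is what keeps a value like a time intact, since times contain their own colons. A quick sketch (the time value is made up for illustration):

    >>> 'Time:ca. 14:30'.split(':', maxsplit=1)
    ['Time', 'ca. 14:30']
    >>> 'Time:ca. 14:30'.split(':')   # without maxsplit, the value itself gets split
    ['Time', 'ca. 14', '30']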
    

    Now we create a dictionary

    row_dict[title] = text
    

    Giving us

    {'Status': 'Accident investigation report completed and information captured'}
    

    We will keep adding to this same dictionary inside the innermost loop

    # old
    for i in table1.find_all('tr'):
        title = i.text
        tl.append(title)
    
    # new
    row_dict = {}
    for i in table1.find_all('tr'):
        title = i.text
        title, text = title.split(":", maxsplit=1)
        row_dict[title] = text
    

    After we've gathered all the data from a page, i.e. completed the row_dict loop, we append the dictionary to tl.

    row_dict = {}
    for i in table1.find_all('tr'):
        title = i.text
        title, text = title.split(":", maxsplit=1)
        row_dict[title] = text
    
    tl.append(row_dict)
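
    Because tl is now a list of dictionaries, pd.DataFrame(tl) aligns the keys into columns automatically, filling NaN where a page lacks a field. A minimal sketch (the Date field is an assumed example, not necessarily the exact page markup):

    import pandas as pd

    tl = [
        {'Status': 'Accident investigation report completed and information captured',
         'Date': 'Saturday 7 March 2015'},
        {'Status': 'Preliminary'},   # this page had no Date row
    ]
    df = pd.DataFrame(tl)   # columns: Status, Date; the missing Date becomes NaN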
    

    All together now

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from datetime import datetime
    import re
    import concurrent.futures
    import itertools
    from random import randint
    from time import sleep
    
    def scraping(year):
    
    
        headers = {
            'accept':'*/*',
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
            }
    
        url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
        #sleep(randint(1,3))
        req = requests.get(url, headers=headers)
    
        soup = BeautifulSoup(req.text,'html.parser')
    
        page_container = soup.find('div',{'class':'pagenumbers'})
    
        pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
            
    
        #info = []
        tl = []
        for page in range(1,pages+1):
    
            new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
            print(new_url)
            
            #sleep(randint(1,3))
            data = requests.get(new_url,headers=headers)
            soup = BeautifulSoup(data.text,'html.parser')
    
    
            table = soup.find('table')
       
        
            for index,row in enumerate(table.find_all('tr')):
                if index == 0:
                    continue
    
                link_ = 'https://aviation-safety.net/'+row.find('a')['href']
                
                #sleep(randint(1,3))
                new_page = requests.get(link_, headers=headers)
                new_soup = BeautifulSoup(new_page.text, 'lxml')
                table1 = new_soup.find('table')
                
                # make changes here: build one dict per accident page
                row_dict = {}
                for i in table1.find_all('tr'):
                    title = i.text
                    title, text = title.split(":", maxsplit=1)
                    row_dict[title] = text
                
                tl.append(row_dict)
                    
        df = pd.DataFrame(tl)
        df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)
              
    
    if __name__ == "__main__":
    
        START = 2015
        STOP = 2016
    
        years = [year for year in range(START,STOP+1)]
    
        print(f'Scraping {len(years)} years of data')
    
        with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
            final_list = executor.map(scraping,years)
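
    One caveat: title.split(":", maxsplit=1) raises a ValueError if a row contains no colon at all. I can't guarantee every row on these pages follows the Name:value pattern, so here is a defensive variant of the inner loop (same behavior otherwise, plus a strip() to drop stray whitespace):

    row_dict = {}
    for i in table1.find_all('tr'):
        title = i.text
        if ':' not in title:
            continue   # skip rows that don't follow the 'Name:value' pattern
        title, text = title.split(':', maxsplit=1)
        row_dict[title.strip()] = text.strip()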