python · pandas · dataframe · web-scraping · data-science

How to scrape a table from a page and create a multi-column dataframe with Python?


The database at https://aviation-safety.net/wikibase/ covers the years 1902 to 2022. I am trying to scrape the table, narrative, probable cause, and classification for every accident in 2015 and 2016: https://aviation-safety.net/database/dblist.php?Year=2015. With the code below I am able to scrape only the table:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
import itertools
from random import randint
from time import sleep

def scraping(year):


    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        }

    url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
    #sleep(randint(1,3))
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text,'html.parser')

    page_container = soup.find('div',{'class':'pagenumbers'})

    pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
        

    #info = []
    tl = []
    for page in range(1,pages+1):

        new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
        print(new_url)
        
        #sleep(randint(1,3))
        data = requests.get(new_url,headers=headers)
        soup = BeautifulSoup(data.text,'html.parser')


        table = soup.find('table')
   
    
        for index,row in enumerate(table.find_all('tr')):
            if index == 0:
                continue

            link_ = 'https://aviation-safety.net/'+row.find('a')['href']
            
            #sleep(randint(1,3))
            new_page = requests.get(link_, headers=headers)
            new_soup = BeautifulSoup(new_page.text, 'lxml')
            table1 = new_soup.find('table')
            
           
            for i in table1.find_all('tr'):
                title = i.text
                tl.append(title)
                
                
    df = pd.DataFrame(tl)
    df.columns = ['status']
    df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)
          

if __name__ == "__main__":

    START = 2015
    STOP = 2016

    years = [year for year in range(START,STOP+1)]

    print(f'Scraping {len(years)} years of data')

    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scraping,years)

But the data is not organized. The dataframe looks like this:

(screenshot: every scraped field stacked into a single column)

The outcome should be like this:

(screenshot: one row per accident, with each field in its own column)


Solution

  • It looks like the values of tl are strings, e.g. 'Status:Accident investigation report completed and information captured'.

    Converting the list of strings into a pd.DataFrame gets you a single column with all the values in the list.
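
    A quick demonstration of that behavior (toy strings here, not the real scraped values):

    >>> import pandas as pd
    >>> pd.DataFrame(['Status:completed', 'Date:7 March 2015']).shape
    (2, 1)   # two rows, one column: every string lands in the same column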

    If you want to use the "name" part of the string, e.g. Status, as a column header, you'll need to separate it from the rest of the text.

    # maxsplit of 1 so we don't accidentally split up the values, e.g. time
    title, text = title.split(":", maxsplit=1)
    

    This looks like

    ('Status', 'Accident investigation report completed and information captured')
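
    The maxsplit=1 is what keeps a value like a time intact, since times contain their own colons. A quick sketch (the time value is made up for illustration):

    >>> 'Time:ca. 14:30'.split(':', maxsplit=1)
    ['Time', 'ca. 14:30']
    >>> 'Time:ca. 14:30'.split(':')   # without maxsplit, the value itself gets split
    ['Time', 'ca. 14', '30']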
    

    Now we create a dictionary

    row_dict[title] = text
    

    Giving us

    {'Status': 'Accident investigation report completed and information captured'}
    

    We will keep adding to this same dictionary inside the innermost loop

    # old
    for i in table1.find_all('tr'):
        title = i.text
        tl.append(title)
    
    # new
    row_dict = {}
    for i in table1.find_all('tr'):
        title = i.text
        title, text = title.split(":", maxsplit=1)
        row_dict[title] = text
    

    After we've gathered all the data from a page, i.e. completed the row_dict loop, we append the dictionary to tl.

    row_dict = {}
    for i in table1.find_all('tr'):
        title = i.text
        title, text = title.split(":", maxsplit=1)
        row_dict[title] = text
    
    tl.append(row_dict)
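
    Because tl is now a list of dictionaries, pd.DataFrame(tl) aligns the keys into columns automatically, filling NaN where a page lacks a field. A minimal sketch (the Date field is an assumed example, not necessarily the exact page markup):

    import pandas as pd

    tl = [
        {'Status': 'Accident investigation report completed and information captured',
         'Date': 'Saturday 7 March 2015'},
        {'Status': 'Preliminary'},   # this page had no Date row
    ]
    df = pd.DataFrame(tl)   # columns: Status, Date; the missing Date becomes NaN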
    

    All together now

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from datetime import datetime
    import re
    import concurrent.futures
    import itertools
    from random import randint
    from time import sleep
    
    def scraping(year):
    
    
        headers = {
            'accept':'*/*',
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
            }
    
        url = f'https://aviation-safety.net/database/dblist.php?Year={year}&sorteer=datekey&page=1'
        #sleep(randint(1,3))
        req = requests.get(url, headers=headers)
    
        soup = BeautifulSoup(req.text,'html.parser')
    
        page_container = soup.find('div',{'class':'pagenumbers'})
    
        pages = max([int(page['href'].split('=')[-1]) for page in page_container.find_all('a')])
            
    
        #info = []
        tl = []
        for page in range(1,pages+1):
    
            new_url = f'https://aviation-safety.net/database/dblist.php?Year={year}&lang=&page={page}'
            print(new_url)
            
            #sleep(randint(1,3))
            data = requests.get(new_url,headers=headers)
            soup = BeautifulSoup(data.text,'html.parser')
    
    
            table = soup.find('table')
       
        
            for index,row in enumerate(table.find_all('tr')):
                if index == 0:
                    continue
    
                link_ = 'https://aviation-safety.net/'+row.find('a')['href']
                
                #sleep(randint(1,3))
                new_page = requests.get(link_, headers=headers)
                new_soup = BeautifulSoup(new_page.text, 'lxml')
                table1 = new_soup.find('table')
                
                # make changes here: build one dict per accident page
                row_dict = {}
                for i in table1.find_all('tr'):
                    title = i.text
                    title, text = title.split(":", maxsplit=1)
                    row_dict[title] = text
                
                tl.append(row_dict)
                    
        df = pd.DataFrame(tl)
        df.to_csv(f'{year}_aviation-safety_new.csv', encoding='utf-8-sig', index=False)
              
    
    if __name__ == "__main__":
    
        START = 2015
        STOP = 2016
    
        years = [year for year in range(START,STOP+1)]
    
        print(f'Scraping {len(years)} years of data')
    
        with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
            final_list = executor.map(scraping,years)
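
    One caveat: title.split(":", maxsplit=1) raises a ValueError if a row contains no colon at all. I can't guarantee every row on these pages follows the Name:value pattern, so here is a defensive variant of the inner loop (same behavior otherwise, plus a strip() to drop stray whitespace):

    row_dict = {}
    for i in table1.find_all('tr'):
        title = i.text
        if ':' not in title:
            continue   # skip rows that don't follow the 'Name:value' pattern
        title, text = title.split(':', maxsplit=1)
        row_dict[title.strip()] = text.strip()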