
How to scrape multiple webpages without overwriting the results?


I'm new to scraping and am trying to scrape multiple webpages from Transfermarkt without overwriting the results of the previous page.

I know this question has been asked before, but I can't get the existing answers to work for this case.

from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import itertools

headers = {'User-Agent' : 'Mozilla/5.0'}
df_headers = ['position_number' , 'position_description' , 'name' , 'dob' , 'nationality' , 'height' , 'foot' , 'joined' , 'signed_from' , 'contract_until']
urls = ['https://www.transfermarkt.com/fc-bayern-munich-u17/kader/verein/21058/saison_id/2018/plus/1', 'https://www.transfermarkt.com/fc-hennef-05-u17/kader/verein/48776/saison_id/2018/plus/1']

for url in urls:
    r = requests.get(url,  headers = headers)
    soup = bs(r.content, 'html.parser')


    position_number = [item.text for item in soup.select('.items .rn_nummer')]
    position_description = [item.text for item in soup.select('.items td:not([class])')]
    name = [item.text for item in soup.select('.hide-for-small .spielprofil_tooltip')]
    dob = [item.text for item in soup.select('.zentriert:nth-of-type(3):not([id])')]
    nationality = ['/'.join([i['title'] for i in item.select('[title]')]) for item in soup.select('.zentriert:nth-of-type(4):not([id])')]
    height = [item.text for item in soup.select('.zentriert:nth-of-type(5):not([id])')]
    foot = [item.text for item in soup.select('.zentriert:nth-of-type(6):not([id])')]
    joined = [item.text for item in soup.select('.zentriert:nth-of-type(7):not([id])')]
    signed_from = ['/'.join([item.find('img')['title'].lstrip(': '), item.find('img')['alt']]) if item.find('a') else ''
                   for item in soup.select('.zentriert:nth-of-type(8):not([id])')]
    contract_until = [item.text for item in soup.select('.zentriert:nth-of-type(9):not([id])')]

df = pd.DataFrame(list(zip(position_number, position_description, name, dob, nationality, height, foot, joined, signed_from, contract_until)), columns = df_headers)
print(df)

df.to_csv(r'Uljanas-MacBook-Air-2:~ uljanadufour$\bayern-munich123.csv')

It would also be helpful to be able to tell which webpage each row came from once the data is scraped.

Any help would be much appreciated.


Solution

  • Your code above scrapes the data for each URL into a set of lists and then moves on to the next URL, overwriting those lists on each pass. Since your call to pd.DataFrame() occurs outside the loop, you are constructing a dataframe from the page data of the very last URL in urls only.

    You need to create a dataframe outside of your for-loop, and then append incoming data for each URL to this dataframe.

    from bs4 import BeautifulSoup as bs
    import requests
    import re
    import pandas as pd
    import itertools
    
    headers = {'User-Agent' : 'Mozilla/5.0'}
    df_headers = ['position_number' , 'position_description' , 'name' , 'dob' , 'nationality' , 'height' , 'foot' , 'joined' , 'signed_from' , 'contract_until']
    urls = ['https://www.transfermarkt.com/fc-bayern-munich-u17/kader/verein/21058/saison_id/2018/plus/1', 'https://www.transfermarkt.com/fc-hennef-05-u17/kader/verein/48776/saison_id/2018/plus/1']
    
    #### Add this before for-loop. ####
    # Create empty dataframe with expected column names.
    df_full = pd.DataFrame(columns = df_headers)
    
    for url in urls:
        r = requests.get(url,  headers = headers)
        soup = bs(r.content, 'html.parser')
    
    
        position_number = [item.text for item in soup.select('.items .rn_nummer')]
        position_description = [item.text for item in soup.select('.items td:not([class])')]
        name = [item.text for item in soup.select('.hide-for-small .spielprofil_tooltip')]
        dob = [item.text for item in soup.select('.zentriert:nth-of-type(3):not([id])')]
        nationality = ['/'.join([i['title'] for i in item.select('[title]')]) for item in soup.select('.zentriert:nth-of-type(4):not([id])')]
        height = [item.text for item in soup.select('.zentriert:nth-of-type(5):not([id])')]
        foot = [item.text for item in soup.select('.zentriert:nth-of-type(6):not([id])')]
        joined = [item.text for item in soup.select('.zentriert:nth-of-type(7):not([id])')]
        signed_from = ['/'.join([item.find('img')['title'].lstrip(': '), item.find('img')['alt']]) if item.find('a') else ''
                       for item in soup.select('.zentriert:nth-of-type(8):not([id])')]
        contract_until = [item.text for item in soup.select('.zentriert:nth-of-type(9):not([id])')]
    
    
        #### Add this to for-loop. ####
    
        # Create a dataframe for page data.
        df = pd.DataFrame(list(zip(position_number, position_description, name, dob, nationality, height, foot, joined, signed_from, contract_until)), columns = df_headers)
    
        # Add page URL to index of page data.
        df.index = [url] * len(df)
    
        # Append page data to full data. (DataFrame.append was removed in
        # pandas 2.0, so use pd.concat instead.)
        df_full = pd.concat([df_full, df])
    
    print(df_full)
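
    One further note: concatenating inside the loop copies the whole dataframe on every iteration. A more idiomatic pattern in current pandas is to build one dataframe per page, collect them in a list, and call pd.concat once at the end. Below is a minimal sketch of that variant, trimmed to two columns to keep it short; the helper name scrape_page and the filename squads.csv are placeholders. Passing index_label='url' writes the page URL into the CSV, so rows from different pages stay distinguishable, and note that the to_csv path in your question has a shell prompt ("Uljanas-MacBook-Air-2:~ uljanadufour$") pasted into it, so it needs replacing with a plain file path like this in any case.

    from bs4 import BeautifulSoup as bs
    import requests
    import pandas as pd

    headers = {'User-Agent': 'Mozilla/5.0'}
    urls = ['https://www.transfermarkt.com/fc-bayern-munich-u17/kader/verein/21058/saison_id/2018/plus/1', 'https://www.transfermarkt.com/fc-hennef-05-u17/kader/verein/48776/saison_id/2018/plus/1']

    def scrape_page(url):
        # Scrape one squad page into a dataframe (trimmed to two columns here;
        # the remaining columns follow the same pattern as above).
        r = requests.get(url, headers=headers)
        soup = bs(r.content, 'html.parser')
        position_number = [item.text for item in soup.select('.items .rn_nummer')]
        name = [item.text for item in soup.select('.hide-for-small .spielprofil_tooltip')]
        df = pd.DataFrame(list(zip(position_number, name)), columns=['position_number', 'name'])
        df.index = [url] * len(df)  # tag each row with its source page
        return df

    # Build one dataframe per page, then concatenate once at the end.
    df_full = pd.concat([scrape_page(url) for url in urls])

    # Write to a plain file path; 'squads.csv' is just a placeholder name.
    df_full.to_csv('squads.csv', index_label='url')

    With the URL kept in the index, df_full.loc[urls[0]] selects only the rows scraped from the first page, which addresses your question about differentiating between the webpages.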