Tags: python, pandas, web-scraping, beautifulsoup

Trying to find out the logic of this page: approx. 100+ results, stored and parsed with Python & BS4


I am trying to find out the logic that is behind this page:

https://www.raiffeisen.ch/rch/de/ueber-uns/raiffeisen-gruppe/organisation/raiffeisenbanken/deutsche-schweiz.html#accordionitem_18104049731620873397

It stores, from A to Z, approximately 120 results or more.

Which options do we have to get the data? Here are some examples of the stored records:

https://www.raiffeisen.ch/zuerich/de.html#bankselector-focus-titlebar

Raiffeisenbank Zürich
Limmatquai 68
8001 Zürich
Tel. +41 43 244 78 78
[email protected]

https://www.raiffeisen.ch/sennwald/de.html

Raiffeisenbank Sennwald
Äugstisriet 7
9466 Sennwald
Tel. +41 81 750 40 40
[email protected]
BIC/Swift Code: RAIFCH22XXX

https://www.raiffeisen.ch/basel/de/ueber-uns/engagement.html#bankselector-focus-titlebar

Raiffeisenbank Basel
St. Jakobs-Strasse 7
4052 Basel
Tel. +41 61 226 27 28
[email protected]

Hmm, I think that somehow all of it is encapsulated in the URL-encoded block...

Well, I am trying to find out, and here is my approach:

import requests
from bs4 import BeautifulSoup

def get_raiffeisen_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        banks = []

        # Find all bank entries (these class names are guesses and
        # probably do not match the page's real markup)
        bank_entries = soup.find_all('div', class_='bank-entry')

        for entry in bank_entries:
            bank = {}
            bank['name'] = entry.find('h2', class_='bank-name').text.strip()
            bank['address'] = entry.find('div', class_='bank-address').text.strip()
            bank['tel'] = entry.find('div', class_='bank-tel').text.strip()
            bank['email'] = entry.find('a', class_='bank-email').text.strip()
            banks.append(bank)
        
        return banks
    else:
        print(f"Failed to retrieve data from {url}")
        return None

url = 'https://www.raiffeisen.ch/rch/de/ueber-uns/raiffeisen-gruppe/organisation/raiffeisenbanken/deutsche-schweiz.html'
banks_data = get_raiffeisen_data(url)

if banks_data:  # guard against a failed request, which returns None
    for bank in banks_data:
        print(f"Name: {bank['name']}")
        print(f"Address: {bank['address']}")
        print(f"Tel: {bank['tel']}")
        print(f"Email: {bank['email']}")
        print('-' * 40)

Solution

  • You need to make a request for each bank, but some of the sites are hosted on raiffeisen.ch while others are redirected to a different site (e.g. lokalbank.ch). These sites have completely different structures, so you need a different logic/strategy for each type of site.

    Synchronous method (slow):

    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urlparse
    import json
    
    
    def scrape_raiffeisen(html):
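        # raiffeisen.ch pages embed the bank details as data attributes
        # on a <meta name="organisationData"> element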
        data = BeautifulSoup(html, 'html.parser').select_one('meta[name=organisationData]')
        bank = {
            'name': data.get('data-organisation-name'), 
            'address': {
                'street': data.get('data-organisation-street'),
                'zip': data.get('data-organisation-zip'),
                'city': data.get('data-organisation-city')
            },
            'tel': data.get('data-organisation-phone'),
            'email': data.get('data-organisation-mail')
        }
    
        return bank
    
    
    def scrape_lokalbank(html):
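        # lokalbank.ch pages expose the details as JSON-LD structured data;
        # the bank node is the second entry in the '@graph' list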
        script = BeautifulSoup(html, 'html.parser').find('script', type='application/ld+json')
        data = json.loads(script.text).get('@graph')[1]
        address = data.get('address', {})
    
        bank = {
            'name': data.get('name'), 
            'address': {
                'street': address.get('streetAddress'),
                'zip': address.get('postalCode'),
                'city': address.get('addressLocality')
            },
            'tel': data.get('telephone'),
            'email': data.get('email')
        }
    
        return bank
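    
    # Shape of the JSON-LD node used above (inferred from the keys read;
    # only the fields actually used are shown):
    # {"@graph": [..., {"name": ..., "telephone": ..., "email": ...,
    #                   "address": {"streetAddress": ..., "postalCode": ...,
    #                               "addressLocality": ...}}]}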
    
    scrapers = {'www.raiffeisen.ch': scrape_raiffeisen, 'www.lokalbank.ch': scrape_lokalbank}
    
    
    url = 'https://www.raiffeisen.ch/rch/de/ueber-uns/raiffeisen-gruppe/organisation/raiffeisenbanken/deutsche-schweiz.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
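    # collect the link to every bank from the A-to-Z accordion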
    bank_links = [a.get('href') for a in soup.select('div.accordion__itemWrapper li > a')]
    
    banks = []
    for link in bank_links:
        response = requests.get(link)
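        # requests follows redirects, so response.url is the final URL;
        # pick the parsing strategy based on its hostname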
        hostname = urlparse(response.url).netloc
        scraper = scrapers.get(hostname)
    
        if not scraper:
            print(f'Could not find scraper for `{response.url}`')
            continue
    
        banks.append(scraper(response.text))
    
    
    print(banks)
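
    Since the question is also tagged pandas: once the banks list is built, the nested dicts can be flattened into a table and exported. A minimal sketch, assuming the banks structure produced above (the CSV filename is just an example):

    import pandas as pd

    # json_normalize flattens the nested 'address' dict into
    # dot-separated columns: address.street, address.zip, address.city
    df = pd.json_normalize(banks)
    df.to_csv('raiffeisen_banks.csv', index=False)
    print(df.head())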
    

    Asynchronous method:

    For faster scraping, we can make the requests asynchronously using a library like aiohttp or httpx. I used curl_cffi:

    from bs4 import BeautifulSoup
    from urllib.parse import urlparse
    import json
    import asyncio
    from curl_cffi.requests import AsyncSession
    
    
    def scrape_raiffeisen(html):
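        # raiffeisen.ch pages embed the bank details as data attributes
        # on a <meta name="organisationData"> element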
        data = BeautifulSoup(html, 'html.parser').select_one('meta[name=organisationData]')
        bank = {
            'name': data.get('data-organisation-name'), 
            'address': {
                'street': data.get('data-organisation-street'),
                'zip': data.get('data-organisation-zip'),
                'city': data.get('data-organisation-city')
            },
            'tel': data.get('data-organisation-phone'),
            'email': data.get('data-organisation-mail')
        }
    
        return bank
    
    
    def scrape_lokalbank(html):
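        # lokalbank.ch pages expose the details as JSON-LD structured data;
        # the bank node is the second entry in the '@graph' list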
        script = BeautifulSoup(html, 'html.parser').find('script', type='application/ld+json')
        data = json.loads(script.text).get('@graph')[1]
        address = data.get('address', {})
    
        bank = {
            'name': data.get('name'), 
            'address': {
                'street': address.get('streetAddress'),
                'zip': address.get('postalCode'),
                'city': address.get('addressLocality')
            },
            'tel': data.get('telephone'),
            'email': data.get('email')
        }
    
        return bank
    
    scrapers = {'www.raiffeisen.ch': scrape_raiffeisen, 'www.lokalbank.ch': scrape_lokalbank}
    
    
    
    def get_banks():
        session = AsyncSession()
    
        async def scrape_bank(link):
            response = await session.get(link)
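            # redirects are followed, so response.url is the final URL;
            # dispatch on its hostname, as in the synchronous version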
            hostname = urlparse(response.url).netloc
            scraper = scrapers.get(hostname)
    
            if not scraper:
                print(f'Could not find scraper for `{response.url}`')
                return
    
            return scraper(response.text)
    
        
        async def main():
            url = 'https://www.raiffeisen.ch/rch/de/ueber-uns/raiffeisen-gruppe/organisation/raiffeisenbanken/deutsche-schweiz.html'
            response = await session.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            bank_links = [a.get('href') for a in soup.select('div.accordion__itemWrapper li > a')]
    
            tasks = [scrape_bank(link) for link in bank_links]
            banks = await asyncio.gather(*tasks)
            await session.close()
            return banks
        
        return asyncio.run(main())
    
    
    banks = get_banks()
    print(banks)
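
    One caveat: asyncio.gather fires all ~120 requests at once, which the server may throttle. A sketch of capping concurrency with asyncio.Semaphore, to be placed inside main() (the limit of 10 is an arbitrary assumption):

    semaphore = asyncio.Semaphore(10)  # at most 10 requests in flight

    async def scrape_bank_limited(link):
        async with semaphore:
            return await scrape_bank(link)

    # ...and build the task list from the wrapper instead:
    # tasks = [scrape_bank_limited(link) for link in bank_links]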