I'm trying to figure out the logic behind this page. We have some results stored in the following DB (from A to Z, approximately 120 results or more). Which options do we have to get the data?
https://www.raiffeisen.ch/zuerich/de.html#bankselector-focus-titlebar
Raiffeisenbank Zürich
Limmatquai 68
8001 Zürich
Tel. +41 43 244 78 78
[email protected]
https://www.raiffeisen.ch/sennwald/de.html
Raiffeisenbank Sennwald
Äugstisriet 7
9466 Sennwald
Tel. +41 81 750 40 40
[email protected]
BIC/Swift Code: RAIFCH22XXX
https://www.raiffeisen.ch/basel/de/ueber-uns/engagement.html#bankselector-focus-titlebar
Raiffeisenbank Basel
St. Jakobs-Strasse 7
4052 Basel
Tel. +41 61 226 27 28
[email protected]
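Each of these entries boils down to the same handful of fields. As a rough sketch of the kind of record I am after (the field names are my own choice, the values are the Zürich entry from above):

banks = [
    {
        'name': 'Raiffeisenbank Zürich',
        'street': 'Limmatquai 68',
        'zip': '8001',
        'city': 'Zürich',
        'tel': '+41 43 244 78 78',
        'email': '...',  # redacted above
    },
    # ... roughly 120 more entries
]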
Hmm, I think the data might somehow all be encapsulated in the URL-encoded block... Well, I'm trying to find out, and here is my approach:
import requests
from bs4 import BeautifulSoup

def get_raiffeisen_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        banks = []
        # Find all bank entries
        bank_entries = soup.find_all('div', class_='bank-entry')
        for entry in bank_entries:
            bank = {}
            bank['name'] = entry.find('h2', class_='bank-name').text.strip()
            bank['address'] = entry.find('div', class_='bank-address').text.strip()
            bank['tel'] = entry.find('div', class_='bank-tel').text.strip()
            bank['email'] = entry.find('a', class_='bank-email').text.strip()
            banks.append(bank)
        return banks
    else:
        print(f"Failed to retrieve data from {url}")
        return None

url = 'https://www.raiffeisen.ch/rch/de/ueber-uns/raiffeisen-gruppe/organisation/raiffeisenbanken/deutsche-schweiz.html'
banks_data = get_raiffeisen_data(url)

if banks_data:
    for bank in banks_data:
        print(f"Name: {bank['name']}")
        print(f"Address: {bank['address']}")
        print(f"Tel: {bank['tel']}")
        print(f"Email: {bank['email']}")
        print('-' * 40)
You need to make a request for each bank, but some of the sites are hosted on raiffeisen.ch while others are redirected to a different site (e.g. lokalbank.ch). These sites have completely different structures, so you need different logic/strategy for each type of site.
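To see which camp a given bank falls into, you can let requests follow redirects (the default) and inspect the final URL. A minimal sketch; I'm using the Sennwald URL from the question as an example, any of the bank links works:

import requests
from urllib.parse import urlparse

response = requests.get('https://www.raiffeisen.ch/sennwald/de.html')
# response.url is the URL we actually ended up on after any redirects
print(urlparse(response.url).netloc)  # e.g. 'www.raiffeisen.ch' or 'www.lokalbank.ch'
if response.history:
    print('redirected via', [r.url for r in response.history])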
Synchronous method (slow):
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import json

def scrape_raiffeisen(html):
    # Bank data is stored as data attributes on a <meta name="organisationData"> tag
    data = BeautifulSoup(html, 'html.parser').select_one('meta[name=organisationData]')
    bank = {
        'name': data.get('data-organisation-name'),
        'address': {
            'street': data.get('data-organisation-street'),
            'zip': data.get('data-organisation-zip'),
            'city': data.get('data-organisation-city')
        },
        'tel': data.get('data-organisation-phone'),
        'email': data.get('data-organisation-mail')
    }
    return bank

def scrape_lokalbank(html):
    # Bank data is stored in a JSON-LD <script> tag
    script = BeautifulSoup(html, 'html.parser').find('script', type='application/ld+json')
    data = json.loads(script.text).get('@graph')[1]
    address = data.get('address', {})
    bank = {
        'name': data.get('name'),
        'address': {
            'street': address.get('streetAddress'),
            'zip': address.get('postalCode'),
            'city': address.get('addressLocality')
        },
        'tel': data.get('telephone'),
        'email': data.get('email')
    }
    return bank

# Pick the scraper based on the hostname the request ends up on after redirects
scrapers = {'www.raiffeisen.ch': scrape_raiffeisen, 'www.lokalbank.ch': scrape_lokalbank}

url = 'https://www.raiffeisen.ch/rch/de/ueber-uns/raiffeisen-gruppe/organisation/raiffeisenbanken/deutsche-schweiz.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Collect the individual bank links from the overview page
bank_links = [a.get('href') for a in soup.select('div.accordion__itemWrapper li > a')]

banks = []
for link in bank_links:
    response = requests.get(link)
    hostname = urlparse(response.url).netloc
    scraper = scrapers.get(hostname)
    if not scraper:
        print(f'Could not find scraper for `{response.url}`')
        continue
    banks.append(scraper(response.text))

print(banks)
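If you want to keep the roughly 120 records around (the question mentions a DB), the simplest option is to dump the list to a file. A minimal sketch writing JSON; the file name is an arbitrary choice:

import json

# Assumes `banks` is the list built by the loop above
with open('raiffeisen_banks.json', 'w', encoding='utf-8') as f:
    json.dump(banks, f, ensure_ascii=False, indent=2)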
Asynchronous method:
For faster scraping, we can make the requests asynchronously using a library like aiohttp or httpx. I used curl_cffi:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import json
import asyncio
from curl_cffi.requests import AsyncSession

def scrape_raiffeisen(html):
    # Bank data is stored as data attributes on a <meta name="organisationData"> tag
    data = BeautifulSoup(html, 'html.parser').select_one('meta[name=organisationData]')
    bank = {
        'name': data.get('data-organisation-name'),
        'address': {
            'street': data.get('data-organisation-street'),
            'zip': data.get('data-organisation-zip'),
            'city': data.get('data-organisation-city')
        },
        'tel': data.get('data-organisation-phone'),
        'email': data.get('data-organisation-mail')
    }
    return bank

def scrape_lokalbank(html):
    # Bank data is stored in a JSON-LD <script> tag
    script = BeautifulSoup(html, 'html.parser').find('script', type='application/ld+json')
    data = json.loads(script.text).get('@graph')[1]
    address = data.get('address', {})
    bank = {
        'name': data.get('name'),
        'address': {
            'street': address.get('streetAddress'),
            'zip': address.get('postalCode'),
            'city': address.get('addressLocality')
        },
        'tel': data.get('telephone'),
        'email': data.get('email')
    }
    return bank

scrapers = {'www.raiffeisen.ch': scrape_raiffeisen, 'www.lokalbank.ch': scrape_lokalbank}

def get_banks():
    session = AsyncSession()

    async def scrape_bank(link):
        response = await session.get(link)
        hostname = urlparse(response.url).netloc
        scraper = scrapers.get(hostname)
        if not scraper:
            print(f'Could not find scraper for `{response.url}`')
            return
        return scraper(response.text)

    async def main():
        url = 'https://www.raiffeisen.ch/rch/de/ueber-uns/raiffeisen-gruppe/organisation/raiffeisenbanken/deutsche-schweiz.html'
        response = await session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        bank_links = [a.get('href') for a in soup.select('div.accordion__itemWrapper li > a')]

        # Fire off all bank requests concurrently and wait for them to finish
        tasks = [scrape_bank(link) for link in bank_links]
        banks = await asyncio.gather(*tasks)

        await session.close()
        return banks

    return asyncio.run(main())

banks = get_banks()
print(banks)
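With 120-odd links you may not want every request in flight at once. If the server starts refusing connections, a semaphore caps the concurrency. A sketch on top of the code above; the limit of 10 is an arbitrary choice:

    # Inside get_banks(), next to scrape_bank - allow at most 10 requests at a time:
    semaphore = asyncio.Semaphore(10)

    async def scrape_bank_limited(link):
        async with semaphore:
            return await scrape_bank(link)

    # and in main(), build the tasks from the wrapper instead:
    # tasks = [scrape_bank_limited(link) for link in bank_links]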