Search code examples
python, email, href, screen-scraping

Python scraping email address from href link


I want to get all email addresses from these schools (green links): http://www.schulliste.eu/schule/

Right now I have code that collects all the href links, but how can I follow every link and scrape the email address from each linked page?

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import requests

def getLinks(url):
    """Return the ``http://`` link targets found on the page at *url*.

    The page is downloaded with ``urlopen`` and parsed with BeautifulSoup.
    Every ``<a>`` tag whose ``href`` attribute starts with ``http://``
    contributes that ``href`` value to the result, in document order.

    Args:
        url: Address of the page to download.

    Returns:
        A list of ``href`` strings.
    """
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being left to the garbage collector.
    with urlopen(url) as html_page:
        # Name the parser explicitly: otherwise bs4 picks whichever parser
        # happens to be installed and emits a warning about it.
        soup = BeautifulSoup(html_page, 'html.parser')

    anchors = soup.find_all('a', attrs={'href': re.compile("^http://")})
    return [anchor.get('href') for anchor in anchors]

# Demo: print every http:// link that getLinks finds on the school listing page.
print(getLinks("http://www.schulliste.eu/schule/"))

Solution

  • You can find all the links to each school, and then run a request on each:

    import requests
    from bs4 import BeautifulSoup as soup
    def get_emails(_links: list, _r=(0, 10)):
        """Yield the contact text found on each selected page of *_links*.

        Args:
            _links: URLs of the individual pages to download.
            _r: Arguments passed to ``range`` to choose which indexes of
                ``_links`` are visited; by default the first ten.

        Yields:
            The ``title`` attribute of the last ``<a class="my_modal_open">``
            element of every visited page that has such an element.
            NOTE(review): presumably that attribute holds the school's
            e-mail address on this site -- confirm against the live markup.
        """
        count = len(_links)
        for i in range(*_r):
            # Skip indexes that fall outside the list so that a list shorter
            # than the requested range no longer raises IndexError.
            if not -count <= i < count:
                continue
            page = soup(requests.get(_links[i]).text, 'html.parser')
            new_d = page.find_all('a', {'class': 'my_modal_open'})
            if not new_d:
                continue
            # .get avoids a KeyError when the anchor carries no title.
            title = new_d[-1].get('title')
            if title is not None:
                yield title
    
    # Download and parse the listing page that links to the individual pages.
    d = soup(requests.get('http://www.schulliste.eu/schule/').text, 'html.parser')
    # href=True restricts the match to anchors that actually have an href,
    # so the i['href'] lookup cannot raise KeyError.
    # NOTE(review): the [52:-9] slice presumably trims navigation and footer
    # links around the school links; it depends on the page layout -- verify.
    results = [i['href'] for i in d.find_all('a', href=True)][52:-9]
    print(list(get_emails(results)))
    

    Output:

    ['[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]']