Search code examples
python ssl beautifulsoup urllib

How can I iterate functions associated with web data?


Find the link at position 3 (the first name is 1). Follow that link. Repeat this process 4 times. The answer is the last name that you retrieve. Sequence of names: Fikret Montgomery Mhairade Butchi Anayah

My code:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Module-level defaults: a counter and two initially empty lists.
count = 0
conec = []
conec2 = []

# TLS context with hostname checking and certificate verification turned
# off, so HTTPS pages with invalid or self-signed certificates can still be
# fetched. check_hostname has to be cleared before verify_mode is relaxed.
# NOTE: this removes protection against man-in-the-middle attacks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE


class linker():
    """Pick links by position from pages fetched over HTTP(S).

    Every call parses only the page it is given, so the result never
    depends on pages fetched by earlier calls. Requests use the
    module-level ``ctx`` SSL context.
    """

    def _nth_link(self, n, u):
        """Return the target of the ``n``-th ``<a>`` tag (1-based) on page ``u``.

        Args:
            n: 1-based position of the anchor tag to pick.
            u: URL of the page to fetch and parse.

        Returns:
            The anchor's ``href`` resolved against ``u``, or ``None`` when
            the selected anchor has no ``href`` attribute.

        Raises:
            IndexError: If ``n`` is smaller than 1 or larger than the
                number of anchor tags on the page.
        """
        # Close the response deterministically instead of leaking it.
        with urllib.request.urlopen(u, context=ctx) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')

        # Collect the hrefs of THIS page only. Appending to a list shared
        # between calls made every lookup hit the first page ever fetched.
        links = [tag.get('href', None) for tag in soup('a')]
        if not 1 <= n <= len(links):
            raise IndexError(
                f'position {n} is out of range: {u} has {len(links)} link(s)'
            )

        href = links[n - 1]
        # Resolve relative hrefs so the result can be fetched directly.
        return urllib.parse.urljoin(u, href) if href else href

    def conectar(self, n, u):
        """Return the link at 1-based position ``n`` on the page at ``u``."""
        return self._nth_link(n, u)

    def new_page(self, n, u):
        """Return the link at 1-based position ``n`` on the page at ``u``.

        Kept for existing callers; behaves exactly like ``conectar``.
        """
        return self._nth_link(n, u)


count = int(input('Enter count: '))
n = int(input('Enter position: '))
x = linker()
# The starting page is set once, before the loop. Resetting it on every
# iteration restarted the chain and capped the crawl at two hops.
u = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
# range() also terminates for a negative count, unlike `while count != 0`.
for _ in range(count):
    # Feed each result back in as the next page to visit.
    u = x.conectar(n, u)
# `a` holds the last link reached (the start URL when count is 0).
a = u
print(a)

Output:

Enter count: 4
Enter position: 3
http://py4e-data.dr-chuck.net/known_by_Mhairade.html

It only shows results up to Mhairade. I tried a bunch of things to obtain the other names, but without success.


Solution

  • Here is a recursive solution using requests.

    import requests
    from bs4 import BeautifulSoup
    
    # Page the crawl starts from; passed to solve() as its first URL.
    url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
    def solve(url, count, position):
        """Follow the link at ``position`` on ``url``, ``count`` times in a row.

        The link and name chosen on every visited page are printed.

        Args:
            url: Address of the page to fetch.
            count: Number of links still to follow; nothing is fetched when
                it is zero or negative.
            position: 1-based index used in the ``li:nth-of-type(...)``
                selector applied to the page's first ``<ul>``.

        Returns:
            The name taken from the last link followed, or ``None`` when
            ``count`` is zero or negative.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an error status.
            AttributeError: If the page has no ``<ul>``, or no ``<li>`` with
                a link at ``position``.
        """
        # Exit Condition
        if count <= 0:
            return None
        # Without a timeout requests waits indefinitely on a silent server.
        r = requests.get(url, timeout=10)
        # Stop on 4xx/5xx instead of trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')
    
        # Scraping the Link and Name
        u = soup.find('ul').select_one(f'li:nth-of-type({position})').find('a')
        t_url, name = u['href'], u.text
        print(f'Link: {t_url}\nName: {name}\n')
    
        # Calling the same function with the above scraped URL
        last_name = solve(t_url, count-1, position)
        # The deepest call returns None, so this level's name is the answer.
        return name if last_name is None else last_name
    
    
    # Number of links to follow in sequence.
    count = int(input('Enter count: '))
    # 1-based position of the list item whose link is followed on each page.
    position = int(input('Enter Position: '))
    
    # Run the crawl from the starting page.
    solve(url, count, position)
    
    Enter count: 4
    Enter Position: 3
    
    Link: http://py4e-data.dr-chuck.net/known_by_Montgomery.html
    Name: Montgomery
    
    Link: http://py4e-data.dr-chuck.net/known_by_Mhairade.html
    Name: Mhairade
    
    Link: http://py4e-data.dr-chuck.net/known_by_Butchi.html
    Name: Butchi
    
    Link: http://py4e-data.dr-chuck.net/known_by_Anayah.html
    Name: Anayah