Search code examples
python web-scraping urllib

Data collection with web scraping


I am trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I am new to web scraping. I hope someone can help me find the problem.

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Address of the IMDb Top chart page to scrape.
url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'

# Download the page and parse it with BeautifulSoup's built-in HTML parser.
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')

#print(soup)

# Every <tr> inside the first <tbody> found on the page
# (presumably one row per film; confirm against the page markup).
film_in= soup.find('tbody').findAll('tr')

#print(film_in)
# Take the first row only, to try the lookups out on a single film.
film = film_in[0]
#print(film)


# NOTE(review): the <a> is matched by an exact `title` attribute that names
# one film's director and cast, so this lookup can only succeed on that
# film's row.
titre = film.find("a",{'title':'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'})
print(titre.text)


# Rating text: the <strong> element inside the "ratingColumn imdbRating" cell.
rang = film.find("td",{'class':'ratingColumn imdbRating'}).find('strong').text
#print(rang)

def remove_parentheses(string):
    """Return ``string`` with every "(" and ")" character removed."""
    # A single translation pass deletes both parenthesis characters.
    return string.translate(str.maketrans("", "", "()"))


# Text of the "secondaryInfo" span in the first row
# (presumably the release year wrapped in parentheses; confirm on the page).
année = film.find("span",{'class':'secondaryInfo'}).text
#print(année)

# Accumulates one dict per table row; turned into a DataFrame below.
imdb =[]

# NOTE(review): the loop variable is `films`, but the body only reads `film`
# (the first row, film_in[0]), so every iteration extracts the same row.
for films in film_in:
    # NOTE(review): matches on one film's exact `title` attribute, so it
    # cannot find the link in any other row.
    titre = film.find("a",{'title':'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'})

    rang = film.find("td",{'class':'ratingColumn imdbRating'}).find('strong').text

    # Same span as above, with the "(" and ")" characters stripped.
    année =(remove_parentheses(film.find("span",{'class':'secondaryInfo'}).text))

    # NOTE(review): the 'film' key stores the whole row Tag `film`, not the
    # `titre` value extracted above.
    dictionnaire = {'film': film,
                    'rang': rang,
                    'année':année
                    }
    imdb.append(dictionnaire)

# NOTE(review): `pd` is not imported anywhere in this snippet (only urlopen
# and BeautifulSoup are), so this line needs `import pandas as pd`.
df_imdb = pd.DataFrame(imdb)
print(df_imdb)

I am trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I need to solve it using urllib; is there a way? Thanks in advance. I am new to web scraping.


Solution

  • You can try the following example:

        from bs4 import BeautifulSoup
        from urllib.request import urlopen
        import requests
        import pandas as pd
        
        # Scrape the IMDb Top chart into a DataFrame with one row per film and
        # the columns 'titre' (title), 'rang' (rating) and 'année' (year).
        url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'

        # Alternative using requests instead of urllib (the option this answer
        # recommends as the more powerful one):
        #   soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        # The `with` block closes the HTTP response as soon as it has been parsed.
        with urlopen(url) as page:
            soup = BeautifulSoup(page, 'html.parser')

        imdb = []
        # Every <tr> of the chart table; the first one is skipped below
        # (presumably the header row, which has no film cells).
        film_in = soup.select('table[class="chart full-width"] tr')
        for film in film_in[1:]:
            # The title is the link text inside the row's "titleColumn" cell.
            titre = film.select_one('.titleColumn a').get_text(strip=True)
            # The rating is the <strong> directly under the rating cell.
            rang = film.select_one('[class="ratingColumn imdbRating"] > strong').text
            # The year keeps its surrounding parentheses, e.g. "(1994)".
            année = film.find("span", {'class': 'secondaryInfo'}).get_text(strip=True)

            imdb.append({
                'titre': titre,
                'rang': rang,
                'année': année,
            })

        df_imdb = pd.DataFrame(imdb)
        print(df_imdb)
    

    Output:

                            titre rang   année
    0    The Shawshank Redemption  9.2  (1994)
    1               The Godfather  9.2  (1972)
    2             The Dark Knight  9.0  (2008)
    3       The Godfather Part II  9.0  (1974)
    4                12 Angry Men  9.0  (1957)
    ..                        ...  ...     ...
    245               Dersu Uzala  8.0  (1975)
    246                   Aladdin  8.0  (1992)
    247                  The Help  8.0  (2011)
    248            The Iron Giant  8.0  (1999)
    249                    Gandhi  8.0  (1982)
    
    [250 rows x 3 columns]