Search code examples
python web-scraping beautifulsoup

Scrape the employee ratings from Indeed in Python


I am new to web scraping and I need to scrape the employee ratings and reviews from Indeed, but my code does not work. Could you please tell me what is wrong with my code? Thanks so much for your help.

from bs4 import BeautifulSoup
import pandas as pd
import requests

# Browser-like User-Agent sent with every request; it is the same for all
# pages, so it is built once outside the loop.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# One dict per review. The DataFrame is built once at the end because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and growing a frame row by row copies it on every call.
rows = []


def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.text if tag is not None else None


# Walk the paginated review list: start=0, 20, ..., 120.
for i in range(0, 140, 20):
    url = (f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}')
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    results = soup.find("div", {"id": 'cmp-container'})
    if results is None:
        # find() returns None when the container is absent; skip this page
        # instead of crashing on results.find_all.
        continue
    for elem in results.find_all(class_='cmp-Review-container'):
        rows.append({
            'review_title': _text_or_none(elem.find(attrs={'class': 'cmp-Review-title'})),
            'review': _text_or_none(elem.find('div', {'class': 'cmp-Review-text'})),
            'author': _text_or_none(elem.find(attrs={'class': 'cmp-Review-author'})),
            'rating': _text_or_none(elem.find(attrs={'class': 'cmp-ReviewRating-text'})),
        })

# NOTE(review): if df comes back with only its header, no element matched the
# cmp-Review-* class names above -- confirm them against the current page markup.
df = pd.DataFrame(rows, columns=['review_title', 'review', 'author', 'rating'])

It only returns the header.

After taking Parikh's suggestion, the code returns the employee reviews, but it does not show the employee status (former or current employee). How can I improve my code to include the employee status?

# Load the Modules
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
import pandas as pd

# Use Big Tech as the samples to scrape the employee reviews on 12/20/2021

# Meta(Facebook), 
# Rows of [title, author, review, rating], one per scraped review.
lst = []

# Browser-like User-Agent sent with every request; identical for all pages,
# so it is built once outside the loop.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# Walk the paginated review list: start=0, 20, ..., 440.
for i in range(0, 460, 20):
    print(i)  # progress indicator
    url = (f'https://www.indeed.com/cmp/Meta-dd1502f2/reviews?start={i}')
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # Each field is extracted independently; a missing tag makes find()
        # return None, and the resulting error is mapped to NaN so one
        # incomplete review does not abort the whole scrape.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan
        try:
            # The second "-"-separated piece of the author line is kept.
            # IndexError covers an author line that contains no "-".
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan
        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan
        try:
            # First whitespace-separated token of the button's aria-label.
            # TypeError: no <button> found (None is not subscriptable).
            # KeyError: the button has no aria-label attribute.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            rating = np.nan
        lst.append([title, author, review, rating])

df_meta = pd.DataFrame(data=lst, columns=['title', 'author', 'review', 'rating'])
df_meta

The output is shown below and I also want to have the employee status. Thanks so much for your help.

enter image description here

enter image description here

Thanks again for your kind help and time. My last question: I tried to scrape the pros and cons, but they only come back as NA. How should I revise the code?

import numpy as np
# Rows of [title, author, status, pros, cons, review, rating], one per review.
lst = []

# Browser-like User-Agent sent with every request; identical for all pages,
# so it is built once outside the loop.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

# Walk the paginated review list: start=0, 20, ..., 220.
for i in range(0, 240, 20):
    print(i)  # progress indicator
    url = (f'https://www.indeed.com/cmp/Airbnb/reviews?start={i}')
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'lxml')
    main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
    for data in main_data:
        # Each field is extracted independently; a missing tag makes find()
        # return None, and the resulting error is mapped to NaN so one
        # incomplete review does not abort the whole scrape.
        try:
            title = data.find("h2").get_text(strip=True)
        except AttributeError:
            title = np.nan

        try:
            # Second "-"-separated piece of the author line.
            # IndexError covers an author line that contains no "-".
            author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
        except (AttributeError, IndexError):
            author = np.nan

        try:
            # First "-"-separated piece of the same author line.
            status = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[0]
        except AttributeError:
            status = np.nan

        try:
            review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
        except AttributeError:
            review = np.nan

        try:
            # find() returns a single Tag (or None), not a list. Indexing a
            # Tag with [0] looks up an HTML attribute named 0 and raises
            # KeyError, which a bare except silently turned into NaN for
            # every review. Call get_text() on the Tag directly instead.
            # NOTE(review): confirm the class name against the current markup.
            pros = data.find('div', class_='cmp-review-pro-text').get_text(strip=True)
        except AttributeError:
            pros = np.nan
        try:
            cons = data.find('div', class_='cmp-review-con-text').get_text(strip=True)
        except AttributeError:
            cons = np.nan

        try:
            # First whitespace-separated token of the button's aria-label.
            # TypeError: no <button> found (None is not subscriptable).
            # KeyError: the button has no aria-label attribute.
            rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
        except (AttributeError, TypeError, KeyError):
            rating = np.nan

        lst.append([title, author, status, pros, cons, review, rating])

enter image description here


Solution

  • First print out main_data to get an overview of which tags hold the data, then extract each field from the matching tag. I have also added try/except blocks so that a missing field becomes NaN.

    import numpy as np
    # Rows of [title, author, status, review, rating], one per scraped review.
    lst = []

    # Browser-like User-Agent sent with every request; identical for all
    # pages, so it is built once outside the loop.
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}

    # Walk the paginated review list: start=0, 20, ..., 120.
    for i in range(0, 140, 20):
        print(i)  # progress indicator
        url = (f'https://www.indeed.com/cmp/Ey/reviews?fcountry=IT&start={i}')
        page = requests.get(url, headers=header)
        soup = BeautifulSoup(page.content, 'lxml')
        # Search the parsed page itself; no `results` variable is defined
        # in this snippet.
        main_data = soup.find_all("div", attrs={"data-tn-section": "reviews"})
        for data in main_data:
            # Each field is extracted independently; a missing tag makes
            # find() return None, and the resulting error is mapped to NaN.
            try:
                title = data.find("h2").get_text(strip=True)
            except AttributeError:
                title = np.nan
            try:
                # Second "-"-separated piece of the author line.
                # IndexError covers an author line that contains no "-".
                author = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[1]
            except (AttributeError, IndexError):
                author = np.nan
            try:
                # First "-"-separated piece of the same author line.
                status = data.find("span", attrs={"itemprop": "author"}).get_text(strip=True).split("-")[0]
            except AttributeError:
                status = np.nan

            try:
                review = data.find("span", attrs={"itemprop": "reviewBody"}).get_text(strip=True)
            except AttributeError:
                review = np.nan
            try:
                # First whitespace-separated token of the button's aria-label.
                # TypeError: no <button> found (None is not subscriptable).
                # KeyError: the button has no aria-label attribute.
                rating = data.find("div", attrs={"itemprop": "reviewRating"}).find("button")['aria-label'].split(" ")[0]
            except (AttributeError, TypeError, KeyError):
                rating = np.nan
            lst.append([title, author, status, review, rating])
    

    Now use lst as data inside DataFrame

    import pandas as pd
    # Turn the scraped rows in lst into a table with one named column per field.
    df = pd.DataFrame(
        lst,
        columns=['title', 'author', 'status', 'review', 'rating'],
    )
    df  # bare expression: shows the frame when run in a notebook/REPL
    

    Output:

                  title            author              status    review rating
    0   good exerccise  Provincia di Milano, Lombardia  Senior Manager(Former Employee) working here can be challenging but helps buil...   3.0