Search code examples
pythonweb-scrapingbeautifulsouppython-requestsscreen-scraping

Not being able to extract all the requirements of each job


enter image description hereHi I am trying to do a web scraping using python, I want to extract the job title, company name, salary and skills required. I managed to get all the information I wanted except one thing which is skills required. Nir Elbaz helped me to partially solve this problem but for some reason I only get the first skill required for each job description. Here is my code as well as a screenshot of the results I got.

import pandas as pd 
import requests 
from bs4 import BeautifulSoup
import csv 
from itertools import zip_longest 
job_title = []
company_name = []
location = []
salary = []
links = []
skills = []
page_number = 0

while True: 
    result = requests.get(f"https://www.reed.co.uk/jobs/data-analyst-jobs?pageno={page_number}")
    src = result.content
    soup = BeautifulSoup(src, 'lxml')
    page_limit = soup.find('div', {"class":"page-counter"})
    Job_titles = soup.find_all('h3', {"class":"title"})
    Company_names = soup.find_all('a', {'class':"gtmJobListingPostedBy"})
    Locations = soup.find_all('li', {'class':"location"})
    Salary = soup.find_all('li',{"class": "salary"})
    #Job_skills = soup.find('strong').text
    #Job_skills = soup.find_all('div',{"class":"description"})
    

    for i in range(len(Job_titles)):
        
        job_title.append(Job_titles[i].text)
        links.append(Job_titles[i].find('a').attrs['href'])
        company_name.append(Company_names[i].text)
        location.append(Locations[i].text)
        salary.append(Salary[i].text)
    
    page_number += 1
    if(page_number >= 2):
        print("pages ended here")
        break 

for link in links: 
    url = "https://www.reed.co.uk/" + link
    result = requests.get(url)
    src = result.content 
    soup = BeautifulSoup(src, 'lxml')
    requirements = soup.find('span', {'itemprop':"description"}).find('ul')
    #print(requirements)
    skills_text = ""
    if not(requirements==None):
        for li in requirements.text.split(';'):
            #print(str(li))
            skills_text += str(li)
    #skills_text = skills_text[0:3]
    skills.append(skills_text.strip())

file_list = [job_title, company_name, location, salary, skills]
exported = zip_longest(*file_list)


Solution

  • Try to replace this section in your code with this:

    if not(requirements==None):
        for li in requirements.find('li'):
            print(str(li))
            skills_text += str(li) + ', '
    skills_text = skills_text[:-2]
    skills.append(skills_text)