Search code examples
pythonselenium-webdriverweb-scrapingbeautifulsouplinkedin-api

HTML tags changes during web scraping LinkedIn using Selenium and BeautifulSoup


I am having a problem where I can't scrape the education and experience sections of a LinkedIn profile using Selenium and BeautifulSoup.

For now, I have successfully scraped the name, headline, and location. But for the education and experience sections, I noticed that the HTML tags change each time I open the inspector, which makes it hard to identify the sections and extract them using BeautifulSoup. Does anyone have a solution? Here is an example of the code:

# Scrape the job title, company name, and dates from the first entry of the
# LinkedIn "experience" section.
# NOTE(review): assumes `soup` (a parsed BeautifulSoup document) is already
# defined. This raises AttributeError whenever the section id is not exactly
# "experience-section" -- LinkedIn appends a changing number to these ids,
# which is the problem described above.
experience = soup.find("section", {"id": "experience-section"}).find('ul')

print(experience)

li_tags = experience.find('div')
a_tags = li_tags.find("a")
job_title = a_tags.find("h3").get_text().strip()

print(job_title)

# The second <p> inside the anchor holds the company name.
company_name = a_tags.find_all("p")[1].get_text().strip()
print(company_name)

joining_date = a_tags.find_all("h4")[0].find_all("span")[1].get_text().strip()
# Fixed: this line was indented in the original, which raises IndentationError.
employment_duration = a_tags.find_all("h4")[1].find_all("span")[1].get_text().strip()

print(joining_date + ", " + employment_duration)

Here you can see the section id, where the number keeps changing.

The inspector markup that I expect should look like this:


Solution

  • You might find it helpful. The script below first logs in to LinkedIn with an email and password, then goes to the profile section by clicking on the profile avatar, and finally gets the page source of the profile to parse it with BeautifulSoup.

    import time

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions, Keys
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait


    def _clean(text):
        """Collapse newlines and strip surrounding whitespace from scraped text."""
        return text.replace('\n', '').strip()


    options = ChromeOptions()

    # Start maximized and hide the "Chrome is being controlled by automated
    # software" toolbar so the page renders like a normal user session.
    options.add_argument("--start-maximized")
    options.add_experimental_option("useAutomationExtension", False)
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option(
        "prefs",
        {
            "credentials_enable_service": False,
            "profile.password_manager_enabled": False,
            # 2 blocks browser notifications; 1 would allow them.
            "profile.default_content_setting_values.notifications": 2,
        },
    )

    driver = webdriver.Chrome(options=options)

    # Open the login page and wait for the login form container to appear.
    url = "https://www.linkedin.com/uas/login"
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "organic-div"))
    )
    container = driver.find_element(By.ID, "organic-div")

    # Log in: fill in the account email and password, then submit with ENTER.
    # NOTE(review): replace the placeholders with real credentials, and never
    # commit real credentials to source control.
    email = container.find_element(By.ID, 'username')
    password = container.find_element(By.ID, 'password')
    email.send_keys("xxxxxxxxxxxxxxxx")
    password.send_keys("xxxxxxxxxxxxxx")
    password.send_keys(Keys.ENTER)
    time.sleep(2)

    # After login, open the profile page by clicking the feed avatar.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "authentication-outlet"))
    )
    driver.find_element(By.CLASS_NAME, 'share-box-feed-entry__avatar').click()

    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, 'lxml')

    # The anchor div has the stable id "experience"; the numbered section ids
    # change between page loads, so navigate relative to this anchor instead.
    experience_div = soup.find('div', {"id": "experience"})
    if experience_div is None:
        raise RuntimeError("could not locate the 'experience' anchor div -- "
                           "the profile page may not have loaded")

    # bs4's camelCase methods (findNext/findAll/findChild) are deprecated
    # aliases; use the snake_case API instead.
    exp_list = (
        experience_div.find_next('div')
        .find_next('div', {"class": "pvs-list__outer-container"})
        .find('ul')
        .find_all('li')
    )

    experiences = []

    for each_exp in exp_list:

        company_logo = each_exp.find_next('img').get('src')
        col = each_exp.find_next("div", {"class": "display-flex flex-column full-width"})
        # Hoist the repeated span lookup: index 0 is the timeframe, 1 the location.
        detail_spans = col.find_all('span', {"class": "t-14 t-normal t-black--light"})
        profile_title = col.find_next('div').find_next('span').find_next('span').text
        company_name = col.find_next('span', {"class": "t-14 t-normal"}).find_next('span').text
        timeframe = detail_spans[0].find_next('span').text
        location = detail_spans[1].find_next('span').text

        experiences.append({
            "company_logo": company_logo,
            "profile_title": _clean(profile_title),
            "company_name": _clean(company_name),
            "timeframe": _clean(timeframe),
            "location": _clean(location),
        })

    print(experiences)
    

    You can parse the other sections, such as education and certifications, in the same way as I did for the experience section.