Search code examples
pythonselenium-webdriverweb-scrapingbeautifulsouplinkedin-api

HTML tags changes during web scraping LinkedIn using Selenium and BeautifulSoup


I am having a problem where I can't scrape the education and experience sections of a LinkedIn profile using Selenium and BeautifulSoup.

For now, I have successfully scraped the name, headline, and location. But for the education and experience sections, I noticed that the HTML tags change each time I open the inspector, which makes it hard to identify the sections and extract them using BeautifulSoup. Does anyone have a solution? Here is an example of the code:

# Scrape the job title, company name, and dates from the first entry of the
# LinkedIn "experience" section.
# NOTE(review): assumes `soup` (a parsed BeautifulSoup document) is already
# defined. This raises AttributeError whenever the section id is not exactly
# "experience-section" -- LinkedIn appends a changing number to these ids,
# which is the problem described above.
experience = soup.find("section", {"id": "experience-section"}).find('ul')

print(experience)

li_tags = experience.find('div')
a_tags = li_tags.find("a")
job_title = a_tags.find("h3").get_text().strip()

print(job_title)

# The second <p> inside the anchor holds the company name.
company_name = a_tags.find_all("p")[1].get_text().strip()
print(company_name)

joining_date = a_tags.find_all("h4")[0].find_all("span")[1].get_text().strip()
# Fixed: this line was indented in the original, which raises IndentationError.
employment_duration = a_tags.find_all("h4")[1].find_all("span")[1].get_text().strip()

print(joining_date + ", " + employment_duration)

Here you can see the section id, where the number keeps changing.

The inspector markup that I expect should look like this:


Solution

  • You might find it helpful. The script below first logs in to LinkedIn with an email and password, then goes to the profile section by clicking on the profile avatar, and finally gets the page source of the profile to parse it with BeautifulSoup.

    import time

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver import ChromeOptions, Keys
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait


    def _clean(text):
        """Collapse newlines and strip surrounding whitespace from scraped text."""
        return text.replace('\n', '').strip()


    options = ChromeOptions()

    # Start maximized and hide the "Chrome is being controlled by automated
    # software" toolbar so the page renders like a normal user session.
    options.add_argument("--start-maximized")
    options.add_experimental_option("useAutomationExtension", False)
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option(
        "prefs",
        {
            "credentials_enable_service": False,
            "profile.password_manager_enabled": False,
            # 2 blocks browser notifications; 1 would allow them.
            "profile.default_content_setting_values.notifications": 2,
        },
    )

    driver = webdriver.Chrome(options=options)

    # Open the login page and wait for the login form container to appear.
    url = "https://www.linkedin.com/uas/login"
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "organic-div"))
    )
    container = driver.find_element(By.ID, "organic-div")

    # Log in: fill in the account email and password, then submit with ENTER.
    # NOTE(review): replace the placeholders with real credentials, and never
    # commit real credentials to source control.
    email = container.find_element(By.ID, 'username')
    password = container.find_element(By.ID, 'password')
    email.send_keys("xxxxxxxxxxxxxxxx")
    password.send_keys("xxxxxxxxxxxxxx")
    password.send_keys(Keys.ENTER)
    time.sleep(2)

    # After login, open the profile page by clicking the feed avatar.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "authentication-outlet"))
    )
    driver.find_element(By.CLASS_NAME, 'share-box-feed-entry__avatar').click()

    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, 'lxml')

    # The anchor div has the stable id "experience"; the numbered section ids
    # change between page loads, so navigate relative to this anchor instead.
    experience_div = soup.find('div', {"id": "experience"})
    if experience_div is None:
        raise RuntimeError("could not locate the 'experience' anchor div -- "
                           "the profile page may not have loaded")

    # bs4's camelCase methods (findNext/findAll/findChild) are deprecated
    # aliases; use the snake_case API instead.
    exp_list = (
        experience_div.find_next('div')
        .find_next('div', {"class": "pvs-list__outer-container"})
        .find('ul')
        .find_all('li')
    )

    experiences = []

    for each_exp in exp_list:

        company_logo = each_exp.find_next('img').get('src')
        col = each_exp.find_next("div", {"class": "display-flex flex-column full-width"})
        # Hoist the repeated span lookup: index 0 is the timeframe, 1 the location.
        detail_spans = col.find_all('span', {"class": "t-14 t-normal t-black--light"})
        profile_title = col.find_next('div').find_next('span').find_next('span').text
        company_name = col.find_next('span', {"class": "t-14 t-normal"}).find_next('span').text
        timeframe = detail_spans[0].find_next('span').text
        location = detail_spans[1].find_next('span').text

        experiences.append({
            "company_logo": company_logo,
            "profile_title": _clean(profile_title),
            "company_name": _clean(company_name),
            "timeframe": _clean(timeframe),
            "location": _clean(location),
        })

    print(experiences)
    

    You can parse the other sections, such as education and certifications, in the same way as I did for the experience section.