I am having a problem where I can't scrape the education and experience sections of a LinkedIn profile using Selenium and BeautifulSoup.
So far, I have successfully scraped the name, headline, and location. But for the education and experience sections, I notice that the HTML tags change each time I open the inspector, which makes it hard for me to identify the sections and extract them with BeautifulSoup. Does anyone have a solution? Here is an example of the code:
# LinkedIn appends a changing numeric suffix to section ids
# (e.g. "experience-section-3"), so an exact match on "experience-section"
# breaks on every page load. Match any <section> whose id STARTS WITH
# "experience" instead, and fail loudly if the section is absent rather
# than raising AttributeError on None.
experience_section = soup.find(
    "section", id=lambda i: i and i.startswith("experience")
)
if experience_section is None:
    raise ValueError("experience section not found in page source")
experience = experience_section.find("ul")
print(experience)
li_tags = experience.find("div")
a_tags = li_tags.find("a")
# job title is the first <h3> inside the entry's anchor
job_title = a_tags.find("h3").get_text().strip()
print(job_title)
# second <p> holds the company name (the first is a visually-hidden label)
company_name = a_tags.find_all("p")[1].get_text().strip()
print(company_name)
# the two <h4> rows carry "joining date" and "employment duration";
# hoist the find_all so the anchor is walked only once
h4_tags = a_tags.find_all("h4")
joining_date = h4_tags[0].find_all("span")[1].get_text().strip()
employment_duration = h4_tags[1].find_all("span")[1].get_text().strip()
print(joining_date + ", " + employment_duration)
Here you can see the section id, where the number keeps changing.
You might find this helpful. The script below first logs in to LinkedIn with an email and password, then navigates to the profile page by clicking on the profile avatar, and finally gets the page source of the profile so it can be parsed with BeautifulSoup.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ChromeOptions, Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Configure Chrome so the session behaves like a normal user browser.
options = ChromeOptions()
# start maximized and hide the "Chrome is being controlled by automated
# test software" infobar by stripping the automation switches
options.add_argument("--start-maximized")
options.add_experimental_option("useAutomationExtension", False)
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option(
"prefs",
{
# disable Chrome's password-manager save/offer prompts
"credentials_enable_service": False,
"profile.password_manager_enabled": False,
"profile.default_content_setting_values.notifications": 2
# 2 blocks browser notifications, 1 would allow them
},
)
driver = webdriver.Chrome(options=options)
# open LinkedIn's login page and wait until the login form container renders
url = "https://www.linkedin.com/uas/login"
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID,"organic-div")))
container = driver.find_element(By.ID, "organic-div")
# login: fill the email account, password
# (replace the placeholder strings with real credentials before running)
email = container.find_element(By.ID, 'username')
password = container.find_element(By.ID, 'password')
email.send_keys("xxxxxxxxxxxxxxxx")
password.send_keys("xxxxxxxxxxxxxx")
password.send_keys(Keys.ENTER)
time.sleep(2)
# wait for the authenticated shell, then open our own profile by
# clicking the avatar in the "start a post" box on the feed
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "authentication-outlet")))
driver.find_element(By.CLASS_NAME, 'share-box-feed-entry__avatar').click()
time.sleep(2)
# hand the fully rendered profile page over to BeautifulSoup for parsing
soup = BeautifulSoup(driver.page_source, 'lxml')
def _clean(text):
    """Collapse newlines and surrounding whitespace from scraped text."""
    return text.replace('\n', '').strip()

# The <div id="experience"> anchor is stable across page loads (unlike the
# old numbered "experience-section" ids); walk from it to the entry list.
# BS4-preferred snake_case methods (find_next/find_all) replace the
# deprecated camelCase aliases (findNext/findAll/findChild).
experience_div = soup.find('div', {"id": "experience"})
if experience_div is None:
    raise ValueError("experience section not found in profile page")
exp_list = (
    experience_div
    .find_next('div')
    .find_next('div', {"class": "pvs-list__outer-container"})
    .find('ul')
    .find_all('li')
)
experiences = []
for each_exp in exp_list:
    # logo may be lazy-loaded/absent; don't crash on a missing <img>
    logo_img = each_exp.find_next('img')
    company_logo = logo_img.get('src') if logo_img else None
    col = each_exp.find_next("div", {"class": "display-flex flex-column full-width"})
    profile_title = col.find_next('div').find_next('span').find_next('span').text
    company_name = col.find_next('span', {"class": "t-14 t-normal"}).find_next('span').text
    # timeframe and location share one CSS class; either may be missing,
    # so index defensively instead of assuming two matches exist
    meta_spans = col.find_all('span', {"class": "t-14 t-normal t-black--light"})
    timeframe = meta_spans[0].find_next('span').text if len(meta_spans) > 0 else ""
    location = meta_spans[1].find_next('span').text if len(meta_spans) > 1 else ""
    experiences.append({
        "company_logo": company_logo,
        "profile_title": _clean(profile_title),
        "company_name": _clean(company_name),
        "timeframe": _clean(timeframe),
        "location": _clean(location),
    })
print(experiences)
You can parse the other sections (education, certifications, etc.) in the same way I did for the experience section.