I am scraping Airbnb (Link to the following page), and one of the things I want to get is since when is the host hosting, as shown in the picture below (marked with red pen):
The code I am currently using to solve this is:
account_active_since = soup.find('li', class_='l7n4lsf atm_9s_1o8liyq_keqd55 dir dir-ltr').getText()
but with this code I get the following output:
3 guestsStudio1 bed1 bath
The HTML tag for it is:
<div class="s1l7gi0l atm_c8_km0zk7 atm_g3_18khvle atm_fr_1m9t47k atm_7l_1esdqks dir dir-ltr"><ol class="lgx66tx atm_gi_idpfg4 atm_l8_idpfg4 dir dir-ltr"><li class="l7n4lsf atm_9s_1o8liyq_keqd55 dir dir-ltr">3 years hosting</li></ol></div>
What am I doing wrong?
Thank you.
Here is the whole code. The part of the code that needs fixing is in line 77 (in the for loop):
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import time
import re
URL = "https://www.airbnb.com/s/San-Francisco--California--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-06-01&monthly_length=3&monthly_end_date=2024-09-01&price_filter_input_type=0&channel=EXPLORE&query=San%20Francisco%2C%20California%2C%20United%20States&place_id=ChIJIQBpAG2ahYAR_6128GcTUEo&date_picker_type=calendar&checkin=2024-05-08&checkout=2024-05-22&source=structured_search_input_header&search_type=autocomplete_click"
driver = webdriver.Chrome()
driver.get(URL)
driver.maximize_window()
time.sleep(4)
try:
accept_cookies = driver.find_element(By.XPATH, '//*[@id="react-application"]/div/div/div[1]/div/div[6]/section/div[2]/div[2]/button')
accept_cookies.click()
except NoSuchElementException:
print("No 'Accept cookies' found.")
elements = driver.find_elements(By.XPATH, "//div[@id='site-content']//div[@data-testid='card-container']//div[@class = ' dir dir-ltr']//a[1]")
while True:
try:
for element in elements:
try:
element.click()
except StaleElementReferenceException:
print("No more elements to click.")
break
# Switch to the newly opened tab
driver.switch_to.window(driver.window_handles[-1])
time.sleep(4)
page_source = driver.page_source
soup = BeautifulSoup(page_source, "html.parser")
try:
close_pop_up = driver.find_element(By.XPATH,
'/html/body/div[9]/div/div/section/div/div/div[2]/div/div[1]/button')
close_pop_up.click()
except NoSuchElementException:
print("No pop up element found.")
try:
apartment_name = soup.find('h1', class_='hpipapi').getText()
except AttributeError:
apartment_name = "Name not specified"
try:
short_description = soup.find('h2', class_='hpipapi').getText()
except AttributeError:
short_description = "Description not specifed"
try:
rooms_bathrooms = soup.find('div', class_='o1kjrihn').getText()
except AttributeError:
rooms_bathrooms = "Utilities not specified"
try:
price_per_night = soup.find('span', class_='_1y74zjx').getText()
except AttributeError:
price_per_night = "Price not specified"
try:
host_name = soup.find('div', class_='cm0tib6').find('div', class_='t1pxe1a4').getText()
except (AttributeError, IndexError):
host_name = "Host name not specified"
# This variable needs to be fixed
try:
account_active_since = soup.find('li', class_='l7n4lsf atm_9s_1o8liyq_keqd55 dir dir-ltr').getText()
except AttributeError:
account_active_since = "Active since not specified"
try:
guest_favourite_stars = soup.find('div', class_='a8jhwcl').getText()
guest_favourite_stars = re.search(r'\d+\.\d+', guest_favourite_stars).group()
except AttributeError:
guest_favourite_stars = "Not guest favourite"
try:
guest_favourite_reviews = soup.find('div', class_='r16onr0j').getText()
except AttributeError:
guest_favourite_reviews = "Not guest favourite"
print("Apartment Name:", apartment_name)
print("Short Description:", short_description)
print("Rooms and Bathrooms:", rooms_bathrooms)
print("Price per Night:", price_per_night)
print("Host Name:", host_name)
print("Account Active Since:", account_active_since)
print("Guest favourite stars:", guest_favourite_stars)
print("Guest favourite reviews:", guest_favourite_reviews)
time.sleep(1)
driver.execute_script(f"window.scrollTo(0, 100);")
driver.save_screenshot(f'screenshots/{apartment_name.replace(" ", "_")}.png')
# Close the current tab
driver.close()
time.sleep(1)
# Switch back to the main tab
driver.switch_to.window(driver.window_handles[0])
except NoSuchElementException:
print("No more elements to click. Heading to the next page.")
try:
next_page = driver.find_element(By.XPATH, '//*[@id="site-content"]/div/div[3]/div/div/div/nav/div/a[5]')
next_page.click()
time.sleep(2)
except NoSuchElementException:
print("No more pages to click.")
break
elements = driver.find_elements(By.XPATH, "//div[@id='site-content']//div[@data-testid='card-container']//div[@class = ' dir dir-ltr']//a[1]")
driver.quit()
Here is one way of getting that information (no BeautifulSoup needed):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
with webdriver.Chrome(options=chrome_options) as browser:
wait = WebDriverWait(browser, 5)
url = 'https://www.airbnb.com/rooms/46181703?check_in=2024-05-08&check_out=2024-05-22&source_impression_id=p3_1714848136_vsUX6ezsi0Z7N%2FSG&previous_page_section_name=1000&federated_search_id=efebfc50-8682-44ac-acee-e5d374cb3da4'
browser.get(url)
hosting_since = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@data-section-id="HOST_OVERVIEW_DEFAULT"]//li[contains(text(), "hosting")]'))).text
print(hosting_since)
Result in terminal:
3 years hosting
Selenium documentation can be found here.