Search code examples
pythonbeautifulsoup

How to extract text from an element using bs4


I am scraping Airbnb (link to the page in question), and one of the things I want to extract is how long the host has been hosting, as shown in the picture below (marked with a red pen):

image example

The code I am currently using to solve this is:

# Take the text of the first <li> that soup.find() matches for this class string.
account_active_since = soup.find('li', class_='l7n4lsf atm_9s_1o8liyq_keqd55 dir dir-ltr').getText()

but with this code I get the following output:

3 guestsStudio1 bed1 bath

The HTML tag for it is:

<div class="s1l7gi0l atm_c8_km0zk7 atm_g3_18khvle atm_fr_1m9t47k atm_7l_1esdqks dir dir-ltr"><ol class="lgx66tx atm_gi_idpfg4 atm_l8_idpfg4 dir dir-ltr"><li class="l7n4lsf atm_9s_1o8liyq_keqd55 dir dir-ltr">3 years hosting</li></ol></div>

HTML

What am I doing wrong?

Thank you.

Here is the whole code. The part of the code that needs fixing is in line 77 (in the for loop):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import time
import re

# Airbnb search results for San Francisco homes (check-in 2024-05-08, check-out 2024-05-22).
URL = "https://www.airbnb.com/s/San-Francisco--California--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-06-01&monthly_length=3&monthly_end_date=2024-09-01&price_filter_input_type=0&channel=EXPLORE&query=San%20Francisco%2C%20California%2C%20United%20States&place_id=ChIJIQBpAG2ahYAR_6128GcTUEo&date_picker_type=calendar&checkin=2024-05-08&checkout=2024-05-22&source=structured_search_input_header&search_type=autocomplete_click"

# Locators used while setting up the results page.
COOKIE_BUTTON_XPATH = '//*[@id="react-application"]/div/div/div[1]/div/div[6]/section/div[2]/div[2]/button'
CARD_LINK_XPATH = "//div[@id='site-content']//div[@data-testid='card-container']//div[@class = ' dir dir-ltr']//a[1]"

# Open the search page in a maximised Chrome window, then pause for a fixed
# four seconds before interacting with it.
driver = webdriver.Chrome()
driver.get(URL)
driver.maximize_window()
time.sleep(4)

# Click the cookie button when it can be located; otherwise just report it.
try:
    driver.find_element(By.XPATH, COOKIE_BUTTON_XPATH).click()
except NoSuchElementException:
    print("No 'Accept cookies' found.")

# Collect the card links of the current results page.
elements = driver.find_elements(By.XPATH, CARD_LINK_XPATH)

# Main scraping loop: click every listing link collected for the current
# results page, scrape and print the listing's details, take a screenshot,
# then follow the pagination link. The loop ends when that link is not found.
while True:
    try:
        for element in elements:
            try:
                element.click()
            except StaleElementReferenceException:
                # A stale link stops the iteration over this page's cards;
                # execution then continues with the next-page lookup below.
                print("No more elements to click.")
                break

            # Switch to the newly opened tab
            # (assumes the click opened the listing in a new tab, so that it
            # is the last entry in window_handles -- TODO confirm)
            driver.switch_to.window(driver.window_handles[-1])

            # Fixed pause before reading the page.
            time.sleep(4)

            # Snapshot of the HTML taken *before* the pop-up below is closed;
            # every soup lookup further down works on this snapshot.
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")

            # Close the pop-up if its button can be located.
            try:
                close_pop_up = driver.find_element(By.XPATH,
                                                   '/html/body/div[9]/div/div/section/div/div/div[2]/div/div[1]/button')
                close_pop_up.click()
            except NoSuchElementException:
                print("No pop up element found.")

            # Each lookup below follows the same pattern: soup.find() returns
            # None when nothing matches, so .getText() raises AttributeError
            # and the placeholder string is used instead.
            try:
                apartment_name = soup.find('h1', class_='hpipapi').getText()
            except AttributeError:
                apartment_name = "Name not specified"

            try:
                short_description = soup.find('h2', class_='hpipapi').getText()
            except AttributeError:
                short_description = "Description not specifed"

            try:
                rooms_bathrooms = soup.find('div', class_='o1kjrihn').getText()
            except AttributeError:
                rooms_bathrooms = "Utilities not specified"

            try:
                price_per_night = soup.find('span', class_='_1y74zjx').getText()
            except AttributeError:
                price_per_night = "Price not specified"

            try:
                host_name = soup.find('div', class_='cm0tib6').find('div', class_='t1pxe1a4').getText()
            except (AttributeError, IndexError):
                host_name = "Host name not specified"

            # This variable needs to be fixed
            # NOTE(review): find() returns only the first <li> matching this
            # class string. If other <li> elements on the page carry the same
            # classes, the text taken here may not be the host's
            # "N years hosting" entry -- TODO narrow the lookup.
            try:
                account_active_since = soup.find('li', class_='l7n4lsf atm_9s_1o8liyq_keqd55 dir dir-ltr').getText()
            except AttributeError:
                account_active_since = "Active since not specified"

            try:
                guest_favourite_stars = soup.find('div', class_='a8jhwcl').getText()
                # Keep only the first decimal number in the text. re.search()
                # returns None when there is none, so .group() raises
                # AttributeError and the placeholder below is used.
                guest_favourite_stars = re.search(r'\d+\.\d+', guest_favourite_stars).group()
            except AttributeError:
                guest_favourite_stars = "Not guest favourite"

            try:
                guest_favourite_reviews = soup.find('div', class_='r16onr0j').getText()
            except AttributeError:
                guest_favourite_reviews = "Not guest favourite"

            # Report everything scraped for this listing.
            print("Apartment Name:", apartment_name)
            print("Short Description:", short_description)
            print("Rooms and Bathrooms:", rooms_bathrooms)
            print("Price per Night:", price_per_night)
            print("Host Name:", host_name)
            print("Account Active Since:", account_active_since)
            print("Guest favourite stars:", guest_favourite_stars)
            print("Guest favourite reviews:", guest_favourite_reviews)

            time.sleep(1)

            # Scroll to a vertical offset of 100 before the screenshot.
            driver.execute_script(f"window.scrollTo(0, 100);")

            # File name is the apartment name with spaces replaced by
            # underscores, written under 'screenshots/'.
            # NOTE(review): presumably that directory must already exist -- confirm.
            driver.save_screenshot(f'screenshots/{apartment_name.replace(" ", "_")}.png')

            # Close the current tab
            driver.close()

            time.sleep(1)

            # Switch back to the main tab
            driver.switch_to.window(driver.window_handles[0])

    except NoSuchElementException:
        print("No more elements to click. Heading to the next page.")

    # Follow the pagination link; when it cannot be found, leave the loop.
    try:
        next_page = driver.find_element(By.XPATH, '//*[@id="site-content"]/div/div[3]/div/div/div/nav/div/a[5]')
        next_page.click()
        time.sleep(2)

    except NoSuchElementException:
        print("No more pages to click.")
        break

    # Re-collect the card links for the results page that was just opened.
    elements = driver.find_elements(By.XPATH, "//div[@id='site-content']//div[@data-testid='card-container']//div[@class = ' dir dir-ltr']//a[1]")

# End the WebDriver session once pagination is exhausted.
driver.quit()

Solution

  • Here is one way of getting that information (no BeautifulSoup needed):

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # Chrome launch flags for this session.
    opts = Options()
    for flag in ("--no-sandbox", 'disable-notifications'):
        opts.add_argument(flag)

    listing_url = 'https://www.airbnb.com/rooms/46181703?check_in=2024-05-08&check_out=2024-05-22&source_impression_id=p3_1714848136_vsUX6ezsi0Z7N%2FSG&previous_page_section_name=1000&federated_search_id=efebfc50-8682-44ac-acee-e5d374cb3da4'
    # <li> inside the host-overview section whose text contains "hosting".
    hosting_li_xpath = '//div[@data-section-id="HOST_OVERVIEW_DEFAULT"]//li[contains(text(), "hosting")]'

    with webdriver.Chrome(options=opts) as driver:
        waiter = WebDriverWait(driver, 5)
        driver.get(listing_url)
        # Wait (5-second timeout) until the element is present in the DOM,
        # then print its text.
        hosting_li = waiter.until(
            EC.presence_of_element_located((By.XPATH, hosting_li_xpath))
        )
        print(hosting_li.text)
    

    Result in terminal:

    3 years hosting
    

    Selenium documentation can be found here.