Tags: python, selenium, web-scraping, infinite-scroll

Scraping an Infinite Scroll Page


I am trying to web-scrape, using Python, a rental houses/apartments site that is pretty famous here in Brazil (QuintoAndar).

I need to enter each of the elements and scrape some information from inside it. Any tips on how to do that, given that it's an infinite-scroll type of page?

Note: I can already enter each element and scrape its data; my only problem is to keep scrolling and scraping the newly loaded data.

Here is the link to the site: https://www.quintoandar.com.br/alugar/imovel/sao-paulo-sp-brasil and an image of it:

[Screenshot: QuintoAndar listings page]

Here is what I have so far. It already works on the first items:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import numpy as np
import pandas as pd

#Initializing the webdriver
options = webdriver.ChromeOptions()

#Change the path to where chromedriver is in your home folder.
path = 'chromedriver'
driver = webdriver.Chrome(executable_path=path, options=options)
driver.set_window_size(1600, 1024)

url = 'https://www.quintoandar.com.br/alugar/imovel/sao-paulo-sp-brasil'
driver.get(url)
time.sleep(5)

num_houses = 40
houses=[]

#Fix: scroll the page down a few cards and back to the top so the listing elements finish rendering
aux = driver.find_elements_by_xpath("//div[@class='sc-1qwl1yl-0 igVsBW']")
driver.execute_script("arguments[0].scrollIntoView();", aux[12])
time.sleep(1)
driver.execute_script("arguments[0].scrollIntoView();", aux[0])
time.sleep(1)

house_buttons = driver.find_elements_by_xpath("//div[@class='sc-1qwl1yl-0 igVsBW']")

for house_button in house_buttons:
    if ('Sem tempo pra procurar' not in house_button.text) and ('Ainda não encontrou seu lar' not in house_button.text):

        house_button.click()

        #Wait for new tab
        time.sleep(2)

        #Switch to it
        driver.switch_to.window(driver.window_handles[1])

        #Wait for the page to load its info
        time.sleep(4)

        try:
            title = driver.find_element_by_xpath("//h1[@class='sc-1q9n36n-0 ghXeyc sc-bdVaJa hgGleC']").text
            address = driver.find_element_by_xpath("//p[@data-testid='listing-address-subtitle']").text
        except Exception:
            title = address = np.nan
        
        #General Infos
        try:
            infos = driver.find_elements_by_xpath("//div[@class='MuiGrid-root tptht-0 fAvqys MuiGrid-item MuiGrid-grid-xs-3 MuiGrid-grid-sm-3 MuiGrid-grid-md-1']")
            size = infos[0].text
            bedroom = infos[1].text
            bathroom = infos[2].text
            garage = infos[3].text
            floor = infos[4].text
            pet = infos[5].text
            furniture = infos[6].text
            subway = infos[7].text
        except Exception:
            size = bedroom = bathroom = garage = floor = pet = furniture = subway = np.nan

        #Price Infos (initialize first so a missing row doesn't leave a variable undefined)
        rent = other = taxes = insurance = services = total = np.nan
        infos = driver.find_elements_by_xpath("//li[contains(@class, 'MuiListItem-root rf1epz-0')]")
        for info in infos:
            if 'Aluguel' in info.text: rent = info.text
            elif 'Condomínio' in info.text: other = info.text
            elif 'IPTU' in info.text: taxes = info.text
            elif 'Seguro incêndio' in info.text: insurance = info.text
            elif 'Taxa de serviço' in info.text: services = info.text
            elif 'Total' in info.text: total = info.text

        houses.append({
            "Title":title,
            "Address":address,
            "Size":size,
            "Bedroom":bedroom,
            "Bathroom":bathroom,
            "Garage":garage,
            "Floor":floor,
            "Pet":pet,
            "Furniture":furniture,
            "Subway":subway,
            "Rent":rent,
            "Other":other,
            "Taxes":taxes,
            "Insurance":insurance,
            "Services":services,
            "Total":total
        })

        #Close Tab and go back to main
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(.5)  

      

Solution

  • What you have to do is:

    1. find the house button you need
    2. scroll down to this button
    3. click the button, switch tab, get the data, go back to the main tab
    4. go to step 1.

    Here's the code to do this (except for extracting the data):

        from selenium import webdriver
        import time
        import numpy as np
    
    
        url = 'https://www.quintoandar.com.br/alugar/imovel/sao-paulo-sp-brasil'
        xpath_house_buttons = "//div[@class='sc-1qwl1yl-0 igVsBW']"
        x_path_title = "//h1[@class='sc-1q9n36n-0 ghXeyc sc-bdVaJa hgGleC']"
        x_path_address = "//p[@data-testid='listing-address-subtitle']"
        num_houses = 40
        houses = []
    
    
        def scroll_to_house_button(driver, num_btn) -> bool:
            """
            returns true if it could scroll to the house button
            """
            try:
                house_buttons = driver.find_elements_by_xpath(xpath_house_buttons)
                driver.execute_script("arguments[0].scrollIntoView();", house_buttons[num_btn])
                return True
            except Exception:
                return False
        def switch_to_house_tab(driver) -> bool:
            """
            returns true if switching tab was successful
            """
            try:
                driver.switch_to.window(driver.window_handles[1])
                return True
            except Exception:
                return False
        def switch_to_main_tab(driver) -> bool:
            """
            returns true if switching tab was successful
            """
            try:
                driver.switch_to.window(driver.window_handles[0])
                return True
            except Exception:
                return False
        def get_house_button_index(house_buttons, houses_scraped_text, index):
            """
            returns house_button's index in house_buttons
            """
            # at the beginning, the house to scrape is given by its index
            if len(houses_scraped_text) < 5:
                return index
            # afterwards, we find it by comparing the buttons' text contents
            else:
                # start at 1 so house_buttons[i - 1] can't wrap around to the end of the list
                for i in reversed(range(1, len(house_buttons))):
                    if (house_buttons[i].text == houses_scraped_text[-1]) and (house_buttons[i - 1].text == houses_scraped_text[-2]):
                        return i + 1
                # fallback if no match was found (e.g. the list re-rendered)
                return min(index, len(house_buttons) - 1)
    
    
        # Initializing the webdriver
        driver = webdriver.Chrome()
        driver.set_window_size(1600, 1024)
        driver.get(url)
    
    
        # get data
        i = 0
        houses_scraped_text = []
        while len(houses) < num_houses:
            house_buttons = driver.find_elements_by_xpath(xpath_house_buttons)
    
            # as house_buttons never exceeds a length of 30,
            # we need a smart way of getting the next one
            index_btn = get_house_button_index(house_buttons, houses_scraped_text, i)
            house_button = house_buttons[index_btn]
            houses_scraped_text.append(house_button.text)
    
            # can't scroll to house button => wait 1 sec
            while not scroll_to_house_button(driver, num_btn=index_btn):
                time.sleep(1)
            print("scroll to house -- house", i + 1)
    
            # filter houses to be scraped
            if ('Sem tempo pra procurar' in house_button.text) or ('Ainda não encontrou seu lar' in house_button.text):
                print("house filtered -- house", i + 1, "\n")
                i += 1 # you have to increment here to not loop over the same house forever
                continue
    
            # new house tab not open yet => wait 1 sec
            while len(driver.window_handles) != 2: # check number of open tabs
                house_button.click()
                time.sleep(1)
            print("new house tab opened -- house", i + 1)
    
            # can't switch yet => wait 1 sec
            while not switch_to_house_tab(driver):
                time.sleep(1)
            print("switched to new house tab -- house", i + 1)
    
            ##################
            # LOAD DATA HERE #
            ##################
            print("data loaded -- house", i + 1)
    
            # close tab
            driver.close()
            # can't switch back to main => wait 1 sec
            while not switch_to_main_tab(driver):
                time.sleep(1)
            print("house tab closed & switched back to main -- house", i + 1)
            
            print(len(houses), "houses scraped\n")
            i += 1
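
    The LOAD DATA HERE placeholder is where the extraction code from the question goes. As a minimal sketch, here is a hypothetical helper (load_house_data is my name for it, not something from the original answer) that reuses the x_path_title and x_path_address XPaths defined above and waits for the elements explicitly instead of sleeping for a fixed time; the XPaths may need updating if the site's markup changes:

        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC


        def load_house_data(driver, houses) -> bool:
            """
            appends one row to houses from the currently open house tab,
            returns true if the data could be extracted
            """
            try:
                # wait up to 10 sec for the title to appear instead of a fixed sleep
                wait = WebDriverWait(driver, 10)
                title = wait.until(EC.presence_of_element_located((By.XPATH, x_path_title))).text
                address = driver.find_element_by_xpath(x_path_address).text
            except Exception:
                # page didn't load in time or the selectors no longer match
                return False
            houses.append({"Title": title, "Address": address})
            return True

    Calling load_house_data(driver, houses) in place of the placeholder is also what makes the outer while len(houses) < num_houses loop terminate, since nothing else grows the list.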