Tags: python, selenium, web-scraping, infinite-scroll

Scraping an Infinite Scroll Page


I am trying to web-scrape, using Python, a rental houses/apartments site that is pretty famous here in Brazil (QuintoAndar).

I need to enter each of the elements and scrape some information from inside it. Any tips on how to do that, given that it's an infinite-scroll type of page?

Note: I can already enter each element and scrape its data; my only problem is to keep scrolling and scraping the newly loaded data.

Here is the link to the site: https://www.quintoandar.com.br/alugar/imovel/sao-paulo-sp-brasil and an image of it:

[Screenshot: QuintoAndar listings page]

Here is what I have so far. It already works on the first items:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import numpy as np
import pandas as pd

#Initializing the webdriver
options = webdriver.ChromeOptions()

#Change the path to where chromedriver is in your home folder.
path = 'chromedriver'
driver = webdriver.Chrome(executable_path=path, options=options)
driver.set_window_size(1600, 1024)

url = 'https://www.quintoandar.com.br/alugar/imovel/sao-paulo-sp-brasil'
driver.get(url)
time.sleep(5)

num_houses = 40
houses=[]

#Fix: scroll the page down a few cards and back to the top so the listing elements finish rendering
aux = driver.find_elements_by_xpath("//div[@class='sc-1qwl1yl-0 igVsBW']")
driver.execute_script("arguments[0].scrollIntoView();", aux[12])
time.sleep(1)
driver.execute_script("arguments[0].scrollIntoView();", aux[0])
time.sleep(1)

house_buttons = driver.find_elements_by_xpath("//div[@class='sc-1qwl1yl-0 igVsBW']")

for house_button in house_buttons:
    if ('Sem tempo pra procurar' not in house_button.text) and ('Ainda não encontrou seu lar' not in house_button.text):

        house_button.click()

        #Wait for new tab
        time.sleep(2)

        #Switch to it
        driver.switch_to.window(driver.window_handles[1])

        #Wait for the page to load its info
        time.sleep(4)

        try:
            title = driver.find_element_by_xpath("//h1[@class='sc-1q9n36n-0 ghXeyc sc-bdVaJa hgGleC']").text
            address = driver.find_element_by_xpath("//p[@data-testid='listing-address-subtitle']").text
        except Exception:
            title = address = np.nan
        
        #General Infos
        try:
            infos = driver.find_elements_by_xpath("//div[@class='MuiGrid-root tptht-0 fAvqys MuiGrid-item MuiGrid-grid-xs-3 MuiGrid-grid-sm-3 MuiGrid-grid-md-1']")
            size = infos[0].text
            bedroom = infos[1].text
            bathroom = infos[2].text
            garage = infos[3].text
            floor = infos[4].text
            pet = infos[5].text
            furniture = infos[6].text
            subway = infos[7].text
        except Exception:
            size = bedroom = bathroom = garage = floor = pet = furniture = subway = np.nan

        #Price Infos (initialize first so a missing row doesn't leave a variable undefined)
        rent = other = taxes = insurance = services = total = np.nan
        infos = driver.find_elements_by_xpath("//li[contains(@class, 'MuiListItem-root rf1epz-0')]")
        for info in infos:
            if 'Aluguel' in info.text: rent = info.text
            elif 'Condomínio' in info.text: other = info.text
            elif 'IPTU' in info.text: taxes = info.text
            elif 'Seguro incêndio' in info.text: insurance = info.text
            elif 'Taxa de serviço' in info.text: services = info.text
            elif 'Total' in info.text: total = info.text

        houses.append({
            "Title":title,
            "Address":address,
            "Size":size,
            "Bedroom":bedroom,
            "Bathroom":bathroom,
            "Garage":garage,
            "Floor":floor,
            "Pet":pet,
            "Furniture":furniture,
            "Subway":subway,
            "Rent":rent,
            "Other":other,
            "Taxes":taxes,
            "Insurance":insurance,
            "Services":services,
            "Total":total
        })

        #Close Tab and go back to main
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(.5)  

      

Solution

  • What you have to do is:

    1. find the house button you need
    2. scroll down to this button
    3. click the button, switch tab, get the data, go back to the main tab
    4. go to step 1.

    Here's the code to do this (except for extracting the data):

        from selenium import webdriver
        import time
        import numpy as np
    
    
        url = 'https://www.quintoandar.com.br/alugar/imovel/sao-paulo-sp-brasil'
        xpath_house_buttons = "//div[@class='sc-1qwl1yl-0 igVsBW']"
        x_path_title = "//h1[@class='sc-1q9n36n-0 ghXeyc sc-bdVaJa hgGleC']"
        x_path_address = "//p[@data-testid='listing-address-subtitle']"
        num_houses = 40
        houses = []
    
    
        def scroll_to_house_button(driver, num_btn) -> bool:
            """
            returns true if it could scroll to the house button
            """
            try:
                house_buttons = driver.find_elements_by_xpath(xpath_house_buttons)
                driver.execute_script("arguments[0].scrollIntoView();", house_buttons[num_btn])
                return True
            except Exception:
                return False
        def switch_to_house_tab(driver) -> bool:
            """
            returns true if switching tab was successful
            """
            try:
                driver.switch_to.window(driver.window_handles[1])
                return True
            except Exception:
                return False
        def switch_to_main_tab(driver) -> bool:
            """
            returns true if switching tab was successful
            """
            try:
                driver.switch_to.window(driver.window_handles[0])
                return True
            except Exception:
                return False
        def get_house_button_index(house_buttons, houses_scraped_text, index):
            """
            returns house_button's index in house_buttons
            """
            # at the beginning, the house to scrape is given by its index
            if len(houses_scraped_text) < 5:
                return index
            # afterwards, we find it by comparing the buttons' text contents
            else:
                # start at 1 so house_buttons[i - 1] can't wrap around to the end of the list
                for i in reversed(range(1, len(house_buttons))):
                    if (house_buttons[i].text == houses_scraped_text[-1]) and (house_buttons[i - 1].text == houses_scraped_text[-2]):
                        return i + 1
                # fallback if no match was found (e.g. the list re-rendered)
                return min(index, len(house_buttons) - 1)
    
    
        # Initializing the webdriver
        driver = webdriver.Chrome()
        driver.set_window_size(1600, 1024)
        driver.get(url)
    
    
        # get data
        i = 0
        houses_scraped_text = []
        while len(houses) < num_houses:
            house_buttons = driver.find_elements_by_xpath(xpath_house_buttons)
    
            # as house_buttons never exceeds a length of 30,
            # we need a smart way of getting the next one
            index_btn = get_house_button_index(house_buttons, houses_scraped_text, i)
            house_button = house_buttons[index_btn]
            houses_scraped_text.append(house_button.text)
    
            # can't scroll to house button => wait 1 sec
            while not scroll_to_house_button(driver, num_btn=index_btn):
                time.sleep(1)
            print("scroll to house -- house", i + 1)
    
            # filter houses to be scraped
            if ('Sem tempo pra procurar' in house_button.text) or ('Ainda não encontrou seu lar' in house_button.text):
                print("house filtered -- house", i + 1, "\n")
                i += 1 # you have to increment here to not loop over the same house forever
                continue
    
            # new house tab not open yet => wait 1 sec
            while len(driver.window_handles) != 2: # check number of open tabs
                house_button.click()
                time.sleep(1)
            print("new house tab opened -- house", i + 1)
    
            # can't switch yet => wait 1 sec
            while not switch_to_house_tab(driver):
                time.sleep(1)
            print("switched to new house tab -- house", i + 1)
    
            ##################
            # LOAD DATA HERE #
            ##################
            print("data loaded -- house", i + 1)
    
            # close tab
            driver.close()
            # can't switch back to main => wait 1 sec
            while not switch_to_main_tab(driver):
                time.sleep(1)
            print("house tab closed & switched back to main -- house", i + 1)
            
            print(len(houses), "houses scraped\n")
            i += 1
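
    The LOAD DATA HERE placeholder is where the extraction code from the question goes. As a minimal sketch, here is a hypothetical helper (load_house_data is my name for it, not something from the original answer) that reuses the x_path_title and x_path_address XPaths defined above and waits for the elements explicitly instead of sleeping for a fixed time; the XPaths may need updating if the site's markup changes:

        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC


        def load_house_data(driver, houses) -> bool:
            """
            appends one row to houses from the currently open house tab,
            returns true if the data could be extracted
            """
            try:
                # wait up to 10 sec for the title to appear instead of a fixed sleep
                wait = WebDriverWait(driver, 10)
                title = wait.until(EC.presence_of_element_located((By.XPATH, x_path_title))).text
                address = driver.find_element_by_xpath(x_path_address).text
            except Exception:
                # page didn't load in time or the selectors no longer match
                return False
            houses.append({"Title": title, "Address": address})
            return True

    Calling load_house_data(driver, houses) in place of the placeholder is also what makes the outer while len(houses) < num_houses loop terminate, since nothing else grows the list.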