Search code examples
python selenium-webdriver web-scraping scrapy

Getting response 200 but no data scraped when using Selenium with Scrapy


import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

class YapoSpider(scrapy.Spider):
    """Scrape rental ads from yapo.cl using a headless Chrome driver.

    The listing page is loaded in Selenium and scrolled step by step before
    the ad links are extracted. Each ad page is then rendered in the same
    driver before the title is read.
    """

    name = 'yapo'
    allowed_domains = ['yapo.cl']
    start_urls = ['https://www.yapo.cl/region-metropolitana/inmuebles/inmuebles/arrendar?tipo-inmueble=departamento,casa&pagina=1']

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and start a headless Chrome session.

        Accepts and forwards arbitrary positional/keyword arguments so that
        spider arguments (``scrapy crawl yapo -a key=value``) keep working.
        """
        super().__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)

    def parse(self, response):
        """Render the listing page, scroll through it and follow every ad link.

        Yields one ``scrapy.Request`` per ad, handled by
        ``parse_dir_contents``.
        """
        self.driver.get(response.url)

        # Scroll in small steps with a pause between them.
        step_px = 50
        pause_s = 0.5

        # Page height is read once, before scrolling starts.
        total_height = self.driver.execute_script("return document.body.scrollHeight")

        for offset in range(0, total_height, step_px):
            self.driver.execute_script(f"window.scrollTo(0, {offset});")
            time.sleep(pause_s)

        # Finish at the bottom of the page.
        self.driver.execute_script(f"window.scrollTo(0, {total_height});")

        # Snapshot the rendered DOM before yielding anything: the driver is
        # reused by parse_dir_contents and will navigate away from this page.
        sel = Selector(text=self.driver.page_source)
        for href in sel.xpath("//a[contains(@class,'card inmo subcategory-1240 category-1000 has-cover is-visible')]/@href").extract():
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Render a single ad page in the browser and yield its title.

        The title is read from the Selenium-rendered DOM rather than from
        ``response``. The ``ng-star-inserted`` class on the heading suggests
        it is inserted client-side, so the raw HTML fetched by Scrapy
        (status 200) presumably does not contain it.

        Yields ``{'title': <str or None>}``; the title is ``None`` when the
        heading is not found.
        """
        # Local import: keeps this fix self-contained in the class.
        from selenium.common.exceptions import TimeoutException

        title_xpath = "//h1[@class='my-2 title order-1 ng-star-inserted']"

        self.driver.get(response.url)
        try:
            # Wait (up to 10 s) until the heading exists in the DOM.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, title_xpath))
            )
        except TimeoutException:
            self.logger.warning("Title heading not found on %s", response.url)

        rendered = Selector(text=self.driver.page_source)
        title = rendered.xpath(f"{title_xpath}/text()").extract_first()
        yield {'title': title}

    def closed(self, reason=None):
        """Quit the browser when the spider closes.

        Scrapy calls this hook with the close reason as an argument, so the
        method must accept it; otherwise the call fails and Chrome is never
        shut down.
        """
        self.driver.quit()

I have this spider, in which I combine Selenium with Scrapy because the href of each ad is loaded dynamically, so I scroll with Selenium to obtain them. When I run `scrapy crawl`, I get a 200 response for each ad, but I don't get any data from the ads, and I don't know what I'm doing wrong. For now I am only testing the code by extracting the title of each ad, but I need to obtain all of the ad's data, especially the latitude and longitude.


Solution

  • We can crawl the ads and their details without Selenium. The site makes two different API calls in the background: one to get the list of ads and one to get the details of an individual ad. You can find these requests in the browser's developer tools (Ctrl + Shift + I) under the Network tab.

    Try the code snippet below.

    import scrapy
    import json
    import re
    
    
    class YapoSpider(scrapy.Spider):
        """Scrape yapo.cl ads through the site's JSON API instead of the HTML.

        The start URL requests a page of search results. Each ad found there
        is then fetched individually from the item endpoint to read its
        title and geo position.
        """

        name = "yapo"
        allowed_domains = ["yapo.cl"]
        start_urls = [
            "https://public-api.yapo.cl/buyers/search?page=0&limit=47&query=%7B%22estateType%22:%5B1,2%5D,%22regionId%22:15,%22category%22:%5B1240%5D%7D&orders=%7B%22orderBy%22:%22listTime%22,%22typeOrder%22:%22desc%22%7D"
        ]

        def __init__(self, *args, **kwargs):
            """Initialise the base spider and store the shared request headers.

            Arguments are forwarded to ``scrapy.Spider`` so that spider
            arguments (``scrapy crawl yapo -a key=value``) are accepted.
            """
            super().__init__(*args, **kwargs)
            # headers are common hence kept it here
            # x-txref token is dynamic but still working as expected even if we hardcode it
            self.headers = {
                "accept": "application/json, text/plain, */*",
                "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
                "origin": "https://www.yapo.cl",
                "referer": "https://www.yapo.cl/",
                "sec-ch-ua": '"Chromium";v="123", "Not:A-Brand";v="8"',
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"Linux"',
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-site",
                "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                "x-chref": "WEB",
                "x-cmref": "client",
                "x-commerce": "Yapo",
                "x-country": "CL",
                "x-domain": "Buyer",
                "x-rhsref": "www.yapo.cl",
                "x-txref": "fcbbbb37-6f67-4cc0-a39a-367769e7eb3b",
            }

        def start_requests(self):
            """Issue the initial search requests with the API headers attached."""
            for url in self.start_urls:
                yield scrapy.Request(url, headers=self.headers)

        def parse(self, response):
            """Read the ad list from the search response and request each ad's details."""
            data = response.json()
            ads_list = data["ads"]
            for ad in ads_list:
                list_id = ad["listId"]
                api_url = f"https://public-api.yapo.cl/buyers/items/{list_id}"
                yield scrapy.Request(api_url, headers=self.headers, callback=self.parse_dir_contents)

        def parse_dir_contents(self, response):
            """Yield the public URL, title and geo position of a single ad.

            The yielded dict has the keys ``url``, ``title`` and ``lat_long``.
            """
            data = response.json()

            # Cleaning title to frame the Ad URL:
            # lowercase, drop everything except ASCII letters, digits and
            # whitespace (so accented letters are removed too), collapse
            # whitespace runs, then join the words with hyphens.
            title = data["header"]["subject"]
            cleaned_title = title.lower()
            cleaned_title = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned_title)
            cleaned_title = re.sub(r"\s+", " ", cleaned_title)
            cleaned_title = cleaned_title.strip().replace(" ", "-")

            list_id = data["header"]["listId"]

            url = f"https://www.yapo.cl/inmuebles/{cleaned_title}_{list_id}"
            lat_long = data["header"]["geoLocation"]["geoPosition"]
            yield {"url": url, "title": title, "lat_long": lat_long}
    

    OUTPUT:

    [{"url": "https://www.yapo.cl/inmuebles/casa-en-maipu_89203270", "title": "Casa en maipu", "lat_long": [""]},
    {"url": "https://www.yapo.cl/inmuebles/ap-a-pasos-del-metro-home-estudio-amplio_89274750", "title": "Ap A Pasos Del Metro - Home Estudio Amplio", "lat_long": [""]},
    {"url": "https://www.yapo.cl/inmuebles/dos-dormitorios-stgo-centro_89124093", "title": "Dos Dormitorios Stgo Centro", "lat_long": ["-33.45348255391156", "-70.63091039657593"]},
    {"url": "https://www.yapo.cl/inmuebles/se-arrienda-dpto-en-san-petersburgo-san-miguel_88635167", "title": "Se arrienda dpto en San Petersburgo -San Miguel", "lat_long": ["-33.5166683", "-70.6478173"]},
    .
    .
    .
    {"url": "https://www.yapo.cl/inmuebles/depto-1-dorm-1-bao-balcn_88919143", "title": "Depto, 1 Dorm, 1 Baño, Balcón", "lat_long": ["-33.4441931", "-70.6341324"]},
    {"url": "https://www.yapo.cl/inmuebles/metro-carlos-valdovinos-2-dormitorios-1-bao_89302591", "title": "Metro Carlos Valdovinos 2 dormitorios 1 baño", "lat_long": ["-33.4873603", "-70.6198297"]},
    {"url": "https://www.yapo.cl/inmuebles/departamento-1-dormitorio-metro-ecuador_89186307", "title": "Departamento 1 Dormitorio Metro Ecuador", "lat_long": [""]}]