python selenium-webdriver web-scraping scrapy

Getting response 200 but no data is scraped when using Selenium with Scrapy

import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

class YapoSpider(scrapy.Spider):
    """Collect ad links from a yapo.cl rental listing page.

    The listing page is opened in a headless Chrome instance and scrolled
    step by step so that dynamically loaded ad cards end up in the page
    source. Each ad URL found there is then handed back to Scrapy, which
    downloads it with a plain HTTP request (no browser involved).
    """

    name = 'yapo'
    allowed_domains = ['yapo.cl']
    start_urls = ['https://www.yapo.cl/region-metropolitana/inmuebles/inmuebles/arrendar?tipo-inmueble=departamento,casa&pagina=1']

    def __init__(self, *args, **kwargs):
        """Create the spider and start the headless Chrome driver.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments (``scrapy crawl yapo -a key=value``) work.
        """
        super().__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)

    def parse(self, response):
        """Scroll the listing page in Chrome and yield one request per ad."""
        self.driver.get(response.url)

        # Scroll step in pixels and pause between steps in seconds.
        # NOTE: time.sleep blocks the thread running this callback for the
        # whole duration of the scroll.
        step_px = 50
        pause_s = 0.5

        # Page height measured once, before scrolling starts.
        total_height = self.driver.execute_script("return document.body.scrollHeight")

        for position in range(0, total_height, step_px):
            self.driver.execute_script(f"window.scrollTo(0, {position});")
            time.sleep(pause_s)

        # Jump to the bottom and give content triggered by this last scroll
        # the same pause as every other step before reading the page source.
        self.driver.execute_script(f"window.scrollTo(0, {total_height});")
        time.sleep(pause_s)

        # Parse the browser-rendered HTML with a Scrapy selector.
        sel = Selector(text=self.driver.page_source)
        for href in sel.xpath("//a[contains(@class,'card inmo subcategory-1240 category-1000 has-cover is-visible')]/@href").extract():
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield the ad title extracted from an ad detail page.

        NOTE(review): ``response`` here is the raw HTML downloaded by Scrapy,
        not the browser-rendered page. The ``ng-star-inserted`` class in the
        XPath suggests the heading is inserted client-side, in which case
        ``title`` will be ``None`` -- confirm against the raw response body.
        """
        title = response.xpath("//h1[@class='my-2 title order-1 ng-star-inserted']/text()").extract_first()
        yield {'title': title}

    def closed(self, reason=None):
        """Quit the Chrome driver when the spider closes.

        Scrapy calls this hook with the close reason as an argument; without
        that parameter the call raises ``TypeError`` and the browser process
        is never shut down.
        """
        self.driver.quit()

I have this spider in which I combine Selenium with Scrapy, because the hrefs of each ad are loaded dynamically, so I scroll with Selenium to obtain them. When I run `scrapy crawl` I get a 200 response for each ad, but I don't get any data from the ads, and I don't know what I'm doing wrong. I am testing the code by extracting only the title of each ad, but I need to obtain all of the ad's data, especially the latitude and longitude.

Solution

We can crawl the ads and their details without Selenium. There are two different API calls made in the background: one to get the list of ads and one to get the details of an individual ad. You can find those requests in the browser's developer tools (Ctrl + Shift + I) under the Network tab.

Try the code snippet below.

import scrapy
import json
import re


class YapoSpider(scrapy.Spider):
    """Scrape yapo.cl rental ads through the site's public JSON API.

    The start URL returns one page of ad summaries; for every ad in it the
    item-detail endpoint is requested and the ad URL, title and geographic
    position are yielded.
    """

    name = "yapo"
    allowed_domains = ["yapo.cl"]
    start_urls = [
        "https://public-api.yapo.cl/buyers/search?page=0&limit=47&query=%7B%22estateType%22:%5B1,2%5D,%22regionId%22:15,%22category%22:%5B1240%5D%7D&orders=%7B%22orderBy%22:%22listTime%22,%22typeOrder%22:%22desc%22%7D"
    ]

    def __init__(self, *args, **kwargs):
        """Create the spider and store the headers shared by all requests.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments (``scrapy crawl yapo -a key=value``) work.
        """
        super().__init__(*args, **kwargs)
        # The same headers are sent with every API request, so they are
        # built once here.
        # The x-txref token is dynamic, but a hard-coded value was still
        # accepted when this was written.
        self.headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
            "origin": "https://www.yapo.cl",
            "referer": "https://www.yapo.cl/",
            "sec-ch-ua": '"Chromium";v="123", "Not:A-Brand";v="8"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Linux"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "x-chref": "WEB",
            "x-cmref": "client",
            "x-commerce": "Yapo",
            "x-country": "CL",
            "x-domain": "Buyer",
            "x-rhsref": "www.yapo.cl",
            "x-txref": "fcbbbb37-6f67-4cc0-a39a-367769e7eb3b",
        }

    def start_requests(self):
        """Request every start URL with the shared API headers."""
        for url in self.start_urls:
            yield scrapy.Request(url, headers=self.headers)

    def parse(self, response):
        """Yield one item-detail request per ad in the search response."""
        data = response.json()
        for ad in data["ads"]:
            list_id = ad["listId"]
            api_url = f"https://public-api.yapo.cl/buyers/items/{list_id}"
            yield scrapy.Request(api_url, headers=self.headers, callback=self.parse_dir_contents)

    @staticmethod
    def _slugify(title):
        """Turn an ad title into the hyphenated slug used in the ad URL.

        Lower-cases the title, drops every character that is not an ASCII
        letter, digit or whitespace (accented letters are removed, not
        transliterated), collapses whitespace runs and joins with hyphens.
        """
        slug = title.lower()
        slug = re.sub(r"[^a-zA-Z0-9\s]", "", slug)
        slug = re.sub(r"\s+", " ", slug)
        return slug.strip().replace(" ", "-")

    def parse_dir_contents(self, response):
        """Yield the URL, title and position of one ad from its detail JSON."""
        header = response.json()["header"]

        title = header["subject"]
        list_id = header["listId"]
        url = f"https://www.yapo.cl/inmuebles/{self._slugify(title)}_{list_id}"

        # Read the position defensively so that an ad without a geoLocation
        # entry still yields an item (with lat_long set to None) instead of
        # being dropped by a KeyError.
        lat_long = (header.get("geoLocation") or {}).get("geoPosition")
        yield {"url": url, "title": title, "lat_long": lat_long}

OUTPUT:

[{"url": "https://www.yapo.cl/inmuebles/casa-en-maipu_89203270", "title": "Casa en maipu", "lat_long": [""]},
{"url": "https://www.yapo.cl/inmuebles/ap-a-pasos-del-metro-home-estudio-amplio_89274750", "title": "Ap A Pasos Del Metro - Home Estudio Amplio", "lat_long": [""]},
{"url": "https://www.yapo.cl/inmuebles/dos-dormitorios-stgo-centro_89124093", "title": "Dos Dormitorios Stgo Centro", "lat_long": ["-33.45348255391156", "-70.63091039657593"]},
{"url": "https://www.yapo.cl/inmuebles/se-arrienda-dpto-en-san-petersburgo-san-miguel_88635167", "title": "Se arrienda dpto en San Petersburgo -San Miguel", "lat_long": ["-33.5166683", "-70.6478173"]},
.
.
.
{"url": "https://www.yapo.cl/inmuebles/depto-1-dorm-1-bao-balcn_88919143", "title": "Depto, 1 Dorm, 1 Baño, Balcón", "lat_long": ["-33.4441931", "-70.6341324"]},
{"url": "https://www.yapo.cl/inmuebles/metro-carlos-valdovinos-2-dormitorios-1-bao_89302591", "title": "Metro Carlos Valdovinos 2 dormitorios 1 baño", "lat_long": ["-33.4873603", "-70.6198297"]},
{"url": "https://www.yapo.cl/inmuebles/departamento-1-dormitorio-metro-ecuador_89186307", "title": "Departamento 1 Dormitorio Metro Ecuador", "lat_long": [""]}]