import time

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class YapoSpider(scrapy.Spider):
    """Crawl Yapo rental listings, rendering pages in headless Chrome.

    The listing page is opened in a Selenium-driven browser and scrolled
    step by step so that the ad links are present in the page source; each
    ad URL is then requested through Scrapy and its title is yielded.
    """

    name = 'yapo'
    allowed_domains = ['yapo.cl']
    start_urls = ['https://www.yapo.cl/region-metropolitana/inmuebles/inmuebles/arrendar?tipo-inmueble=departamento,casa&pagina=1']

    # XPath expressions used against the rendered page source.
    AD_LINK_XPATH = "//a[contains(@class,'card inmo subcategory-1240 category-1000 has-cover is-visible')]/@href"
    TITLE_XPATH = "//h1[@class='my-2 title order-1 ng-star-inserted']"
    # Seconds to wait for the title element when rendering an ad in Chrome.
    RENDER_TIMEOUT = 10

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start a headless Chrome driver.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that spider arguments (``scrapy crawl yapo -a key=value``) work.
        """
        super().__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)

    def parse(self, response):
        """Scroll the listing page in Chrome and request every ad it links to.

        Yields one ``scrapy.Request`` per ad link, handled by
        ``parse_dir_contents``.
        """
        self.driver.get(response.url)

        # Scroll step in pixels and pause in seconds between steps.
        incremento = 50
        velocidad = 0.5

        altura_total = self.driver.execute_script("return document.body.scrollHeight")
        for posicion in range(0, altura_total, incremento):
            self.driver.execute_script(f"window.scrollTo(0, {posicion});")
            # NOTE: time.sleep blocks Scrapy's event loop while scrolling.
            time.sleep(velocidad)

        # Finish at the very bottom of the page.
        self.driver.execute_script(f"window.scrollTo(0, {altura_total});")

        sel = Selector(text=self.driver.page_source)
        for href in sel.xpath(self.AD_LINK_XPATH).getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield the ad title, rendering the page in Chrome if needed.

        The title is first looked up in the downloaded response. When it is
        not there (the response can be 200 and still lack the element), the
        ad URL is loaded in the Selenium driver and the title is read from
        the rendered page source instead.
        """
        title = response.xpath(self.TITLE_XPATH + "/text()").get()
        if title is None:
            title = self._rendered_title(response.url)
        yield {'title': title}

    def _rendered_title(self, url):
        """Load ``url`` in Chrome and return the title text, or ``None``."""
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, self.RENDER_TIMEOUT).until(
                EC.presence_of_element_located((By.XPATH, self.TITLE_XPATH))
            )
        except TimeoutException:
            self.logger.warning(
                "Title not found within %s seconds: %s", self.RENDER_TIMEOUT, url
            )
            return None
        rendered = Selector(text=self.driver.page_source)
        return rendered.xpath(self.TITLE_XPATH + "/text()").get()

    def closed(self, reason=None):
        """Quit the Chrome driver when the spider closes.

        Scrapy invokes this hook with the close reason as an argument, so
        the parameter must be accepted or the driver would never be quit.
        """
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()
I have this spider in which I combine Selenium with Scrapy, because the hrefs of each ad are loaded dynamically, so I scroll with Selenium to obtain them. When I run `scrapy crawl`, I get a 200 response for each ad, but I don't get any data from the ads, and I don't know what I'm doing wrong. I am testing the code by extracting the title of each ad, but I need to obtain all of the ad's data, especially the latitude and longitude.
We can crawl the ads and their respective details without Selenium. There are two different API calls in the background: one to get the list of ads and one to get the details of an individual ad. You can find those requests in the browser's developer tools (Ctrl + Shift + I) under the Network section.
Try the code snippet below.
import scrapy
import json
import re
class YapoSpider(scrapy.Spider):
    """Crawl Yapo rental ads through the site's public JSON API.

    The search endpoint returns a list of ads; each ad's ``listId`` is then
    used to request the item endpoint, from which the title, a reconstructed
    ad URL and the geo position are yielded.
    """

    name = "yapo"
    allowed_domains = ["yapo.cl"]
    start_urls = [
        "https://public-api.yapo.cl/buyers/search?page=0&limit=47&query=%7B%22estateType%22:%5B1,2%5D,%22regionId%22:15,%22category%22:%5B1240%5D%7D&orders=%7B%22orderBy%22:%22listTime%22,%22typeOrder%22:%22desc%22%7D"
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the spider and the headers shared by every API request.

        Arguments are forwarded to ``scrapy.Spider`` so spider arguments
        passed on the command line keep working.
        """
        super().__init__(*args, **kwargs)
        # Headers are common to all requests, hence kept here.
        # The x-txref token is dynamic but still works when hard-coded.
        self.headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
            "origin": "https://www.yapo.cl",
            "referer": "https://www.yapo.cl/",
            "sec-ch-ua": '"Chromium";v="123", "Not:A-Brand";v="8"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Linux"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "x-chref": "WEB",
            "x-cmref": "client",
            "x-commerce": "Yapo",
            "x-country": "CL",
            "x-domain": "Buyer",
            "x-rhsref": "www.yapo.cl",
            "x-txref": "fcbbbb37-6f67-4cc0-a39a-367769e7eb3b",
        }

    def start_requests(self):
        """Request every start URL with the shared API headers."""
        for url in self.start_urls:
            yield scrapy.Request(url, headers=self.headers)

    def parse(self, response):
        """Yield one item-detail request per ad in the search response."""
        data = response.json()
        for ad in data.get("ads") or []:
            list_id = ad.get("listId")
            if list_id is None:
                # Without an id there is no detail endpoint to call.
                continue
            api_url = f"https://public-api.yapo.cl/buyers/items/{list_id}"
            yield scrapy.Request(
                api_url, headers=self.headers, callback=self.parse_dir_contents
            )

    @staticmethod
    def _slugify(title):
        """Turn an ad title into the hyphenated slug used in the ad URL.

        Lower-cases the title, removes every character that is not an ASCII
        letter, digit or whitespace (so accented letters are dropped),
        collapses whitespace runs and joins the words with hyphens.
        """
        slug = re.sub(r"[^a-zA-Z0-9\s]", "", title.lower())
        slug = re.sub(r"\s+", " ", slug)
        return slug.strip().replace(" ", "-")

    def parse_dir_contents(self, response):
        """Yield the URL, title and geo position of a single ad.

        Responses lacking a title or list id are logged and skipped; a
        missing geo position is reported as ``None`` instead of raising.
        """
        header = response.json().get("header") or {}
        title = header.get("subject")
        list_id = header.get("listId")
        if not title or list_id is None:
            self.logger.warning("Ad response without title or id: %s", response.url)
            return

        url = f"https://www.yapo.cl/inmuebles/{self._slugify(title)}_{list_id}"
        geo_location = header.get("geoLocation") or {}
        lat_long = geo_location.get("geoPosition")
        yield {"url": url, "title": title, "lat_long": lat_long}
OUTPUT:
[{"url": "https://www.yapo.cl/inmuebles/casa-en-maipu_89203270", "title": "Casa en maipu", "lat_long": [""]},
{"url": "https://www.yapo.cl/inmuebles/ap-a-pasos-del-metro-home-estudio-amplio_89274750", "title": "Ap A Pasos Del Metro - Home Estudio Amplio", "lat_long": [""]},
{"url": "https://www.yapo.cl/inmuebles/dos-dormitorios-stgo-centro_89124093", "title": "Dos Dormitorios Stgo Centro", "lat_long": ["-33.45348255391156", "-70.63091039657593"]},
{"url": "https://www.yapo.cl/inmuebles/se-arrienda-dpto-en-san-petersburgo-san-miguel_88635167", "title": "Se arrienda dpto en San Petersburgo -San Miguel", "lat_long": ["-33.5166683", "-70.6478173"]},
.
.
.
{"url": "https://www.yapo.cl/inmuebles/depto-1-dorm-1-bao-balcn_88919143", "title": "Depto, 1 Dorm, 1 Baño, Balcón", "lat_long": ["-33.4441931", "-70.6341324"]},
{"url": "https://www.yapo.cl/inmuebles/metro-carlos-valdovinos-2-dormitorios-1-bao_89302591", "title": "Metro Carlos Valdovinos 2 dormitorios 1 baño", "lat_long": ["-33.4873603", "-70.6198297"]},
{"url": "https://www.yapo.cl/inmuebles/departamento-1-dormitorio-metro-ecuador_89186307", "title": "Departamento 1 Dormitorio Metro Ecuador", "lat_long": [""]}]