import time

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class YapoSpider(scrapy.Spider):
    """Collect ad links from a listing page rendered with Selenium.

    The listing page is loaded in a Chrome instance and scrolled step by
    step; the resulting page source is then parsed for ad links, and each
    link is requested through Scrapy and handed to ``parse_dir_contents``.
    """

    name = 'yapo'
    # NOTE(review): the domain and the listing URL look truncated in this
    # copy of the file -- restore the full values before running.
    allowed_domains = ['']
    start_urls = [',casa&pagina=1']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Chrome driver.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so that spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        chrome_options = Options()
        self.driver = webdriver.Chrome(options=chrome_options)

    def parse(self, response):
        """Scroll the listing page in the browser and follow every ad link.

        Yields one ``scrapy.Request`` per matching anchor, with
        ``parse_dir_contents`` as the callback.
        """
        # Scroll step in pixels and pause (seconds) between steps.
        incremento = 50
        velocidad = 0.5

        # The driver has to load the page itself; the Scrapy response is
        # not shared with the browser.
        self.driver.get(response.url)

        altura_total = self.driver.execute_script(
            "return document.body.scrollHeight"
        )
        for posicion in range(0, altura_total, incremento):
            self.driver.execute_script(f"window.scrollTo(0, {posicion});")
            # Pause between steps so content can load while scrolling.
            # NOTE: time.sleep blocks the whole crawl while it waits.
            time.sleep(velocidad)
        # Finish at the bottom of the page.
        self.driver.execute_script(f"window.scrollTo(0, {altura_total});")

        # Parse the browser's page source rather than the Scrapy response.
        sel = Selector(text=self.driver.page_source)
        ad_links = sel.xpath(
            "//a[contains(@class,'card inmo subcategory-1240 category-1000 has-cover is-visible')]/@href"
        ).extract()
        for href in ad_links:
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield the title of a single ad page.

        The title is read from the Scrapy response, not from the Selenium
        driver; ``title`` is ``None`` when the XPath matches nothing.
        """
        # NOTE(review): the 'ng-star-inserted' class suggests this heading
        # may be rendered client-side, in which case it would be absent
        # from the raw response -- confirm against the page source.
        title = response.xpath(
            "//h1[@class='my-2 title order-1 ng-star-inserted']/text()"
        ).extract_first()
        yield {'title': title}

    def closed(self, reason=None):
        """Quit the browser when the spider closes.

        ``reason`` is supplied by Scrapy; it defaults to ``None`` so direct
        calls without arguments still work.
        """
        self.driver.quit()
I have this spider, in which I combine Selenium with Scrapy because the hrefs of the ads are loaded dynamically, so I scroll with Selenium to obtain them. When I run `scrapy crawl`, I get a 200 response for each ad, but I don't get any data from the ads, and I don't know what I'm doing wrong. For now I am testing the code by extracting only the title of each ad, but I need to obtain all of the ad's data, especially the latitude and longitude.
We can crawl the ads and their details without Selenium. There are two different API calls in the background: one to get the list of ads and one to get the details of an individual ad. You can find those requests in the browser's developer tools (Ctrl + Shift + I) under the Network tab.
Try the code snippet below.
import scrapy
import json
import re
class YapoSpider(scrapy.Spider):
    """Fetch ads and their details from JSON endpoints, without Selenium.

    ``parse`` reads the list of ads from a JSON response and requests the
    detail document of each ad; ``parse_dir_contents`` extracts the title,
    a slug-based URL and the geo position from that detail document.
    """

    name = "yapo"
    allowed_domains = [""]
    # TODO(review): the listing API URL was lost from this copy of the
    # file; add it here, otherwise start_requests yields nothing.
    start_urls = []

    # Compiled once: used for every ad when building the URL slug.
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9\s]")
    _WHITESPACE = re.compile(r"\s+")

    def __init__(self, *args, **kwargs):
        """Initialise the spider and build the headers sent on every request.

        Positional and keyword arguments are forwarded to
        ``scrapy.Spider.__init__`` so that spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        # The same headers are sent with the listing and detail requests.
        # x-txref token is dynamic but still working as expected even if
        # we hardcode it.
        # NOTE(review): "origin", "referer" and "x-rhsref" are empty in
        # this copy of the file -- confirm whether values were stripped.
        self.headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
            "origin": "",
            "referer": "",
            "sec-ch-ua": '"Chromium";v="123", "Not:A-Brand";v="8"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Linux"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36",
            "x-chref": "WEB",
            "x-cmref": "client",
            "x-commerce": "Yapo",
            "x-country": "CL",
            "x-domain": "Buyer",
            "x-rhsref": "",
            "x-txref": "fcbbbb37-6f67-4cc0-a39a-367769e7eb3b",
        }

    def start_requests(self):
        """Request every start URL with the shared headers."""
        for url in self.start_urls:
            yield scrapy.Request(url, headers=self.headers)

    def parse(self, response):
        """Yield one detail request per ad found in the listing JSON."""
        data = response.json()
        # A response without an "ads" key yields no requests instead of
        # raising KeyError.
        for ad in data.get("ads") or []:
            list_id = ad["listId"]
            # NOTE(review): the base URL of the detail endpoint appears to
            # have been stripped from this f-string; as written the request
            # URL is only the list id and will be rejected by Scrapy.
            api_url = f"{list_id}"
            yield scrapy.Request(
                api_url,
                headers=self.headers,
                callback=self.parse_dir_contents,
            )

    def parse_dir_contents(self, response):
        """Yield the URL, title and geo position of a single ad.

        ``lat_long`` is ``None`` when the detail document carries no
        ``geoLocation``/``geoPosition`` entry.
        """
        data = response.json()
        header = data["header"]

        # Clean the title to build the slug part of the ad URL.
        title = header["subject"]
        cleaned_title = self._NON_ALNUM.sub("", title.lower())
        cleaned_title = self._WHITESPACE.sub(" ", cleaned_title)
        cleaned_title = cleaned_title.strip().replace(" ", "-")

        list_id = header["listId"]
        # NOTE(review): the URL prefix appears to have been stripped from
        # this f-string; only "<slug>_<list id>" remains.
        url = f"{cleaned_title}_{list_id}"

        # Ads without location data should not crash the whole callback.
        lat_long = (header.get("geoLocation") or {}).get("geoPosition")
        yield {"url": url, "title": title, "lat_long": lat_long}
[{"url": "", "title": "Casa en maipu", "lat_long": [""]},
{"url": "", "title": "Ap A Pasos Del Metro - Home Estudio Amplio", "lat_long": [""]},
{"url": "", "title": "Dos Dormitorios Stgo Centro", "lat_long": ["-33.45348255391156", "-70.63091039657593"]},
{"url": "", "title": "Se arrienda dpto en San Petersburgo -San Miguel", "lat_long": ["-33.5166683", "-70.6478173"]},
{"url": "", "title": "Depto, 1 Dorm, 1 Baño, Balcón", "lat_long": ["-33.4441931", "-70.6341324"]},
{"url": "", "title": "Metro Carlos Valdovinos 2 dormitorios 1 baño", "lat_long": ["-33.4873603", "-70.6198297"]},
{"url": "", "title": "Departamento 1 Dormitorio Metro Ecuador", "lat_long": [""]}]