Search code examples
pythonscrapy

Scrapy chain of requests to combine items from multiple requests


I am trying to build a single item whose similarIdeas field is a list filled in from several follow-up pages. Right now I am using the requests module to fetch that data, but I need to issue those requests through Scrapy by chaining them and then yield one single item at the end, and I am not sure how to do it correctly. Here is my code using the requests module:

import scrapy
from scrapy.selector import Selector
import json
import math
import requests


class HouzzScraper(scrapy.Spider):
    name = "houzz"

    # custom settings
    custom_settings = {
        "LOG_FILE": "houzz_spider.log",
        "IMAGES_STORE": "houzz_images",
        "FEEDS": {
            "houzz.json": {
                "format": "json",
            }
        },
        "ITEM_PIPELINES": {
            "houzz_crawler.pipelines.HouzzImagePipeline": 1,
        },
    }

    headers = {
        "authority": "www.houzz.com",
        "accept": "*/*",
        "accept-language": "en,ru;q=0.9",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://www.houzz.com",
        "referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
        "rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
        "sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
        "x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
        "x-hz-request": "true",
        "x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
        "x-ol-exp-name": "Photo - View",
        "x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
        "x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-ol-product": "Houzz",
        "x-ol-product-variant": "Houzz US",
        "x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-requested-with": "XMLHttpRequest",
    }

    cookies = {
        "v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
        "vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
        "_gcl_au": "1.1.17413922.1683311086",
        "crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
        "_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
        "_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
        "g_state": '{"i_p":1684144918349,"i_l":3}',
        "browseResultSetGridWidth": "554",
        "_gid": "GA1.2.1176067560.1683652076",
        "ln_or": "eyIzODE1NzE2IjoiZCJ9",
        "_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
        "jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
        "documentWidth": "1318",
        "_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
        "_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
        "_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
        "IR_gbd": "houzz.com",
        "IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
        "_ga": "GA1.2.1658927820.1683311086",
        "_dc_gtm_UA-3519678-1": "1",
        "_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
        "hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
    }

    base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"

    similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"

    def start_requests(self):
        """Seed the crawl with a single request for the photo listing page."""
        listing_request = scrapy.Request(
            url=self.base_url,
            headers=self.headers,
            callback=self.parse_ideas,
        )
        yield listing_request

    def parse_ideas(self, response):
        """Schedule one request per photo card found on the listing page.

        Args:
            response: the listing page response.

        Yields:
            scrapy.Request: one request per photo-card link, handled by
            ``parse_project_url``.
        """
        # Pagination counters are deliberately not parsed here: nothing
        # consumes them, and indexing into the pagination text would abort
        # this callback (before any request is yielded) on pages where that
        # text is missing or shaped differently.
        ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()

        for idea in ideas:
            yield scrapy.Request(
                url=idea, headers=self.headers, callback=self.parse_project_url
            )

    def parse_project_url(self, response):
        """Derive the project page URL from a photo page and request it.

        Reads the JSON embedded in the ``hz-ctx`` script tag, looks up the
        current space in the ``SpaceStore`` and rewrites the part of the space
        URL before ``~`` (``phvw`` -> ``pj``, ``vp`` -> ``vj``), then appends
        the project id after a ``~``.
        """
        context = json.loads(response.css('script[id="hz-ctx"] ::text').get())
        page_data = context["data"]
        current_space_id = page_data["pageContentData"]["spaceId"]
        space = page_data["stores"]["data"]["SpaceStore"]["data"][current_space_id]

        project_id = space["projectId"]
        url_prefix = space["url"].split("~")[0]
        url_prefix = url_prefix.replace("phvw", "pj")
        url_prefix = url_prefix.replace("vp", "vj")

        yield scrapy.Request(
            url=url_prefix + "~" + str(project_id),
            headers=self.headers,
            callback=self.parse_project_idea,
        )

    def parse_project_idea(self, response):
        """Request every link found inside a project page's photo container.

        ``dont_filter=True`` lets a photo page be requested again even when
        the same URL was already scheduled earlier in the crawl.
        """
        photo_links = response.css(
            "div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
        ).extract()

        yield from (
            scrapy.Request(
                url=link,
                dont_filter=True,
                headers=self.headers,
                callback=self.parse_idea_details,
            )
            for link in photo_links
        )

    def parse_idea_details(self, response):
        """Build one item for a photo page, including its similar ideas.

        The page's own fields are scraped from ``response``. The similar
        spaces are then looked up through the ``getSimilarSpaces`` endpoint,
        and each similar page is downloaded and scraped with the same
        selectors.

        NOTE: the ``requests`` calls below are synchronous, so this callback
        does not finish until every similar page has been downloaded.

        Args:
            response: the photo ("idea") page response.

        Yields:
            dict: the item; ``similarIdeas`` holds one dict per similar space.
        """
        item = {}
        item["ideaUrl"] = response.url
        item["Title"] = response.css(
            "h1.hz-view-photo__space-info__title.text-bold::text"
        ).get()
        subtitle = response.css(
            "h1.hz-view-photo__space-info__subtitle.text-m::text"
        ).get()
        item["subTitle"] = subtitle
        item["spaceDescription"] = response.css(
            "div.hz-view-photo__space-info__description.text-m ::text"
        ).get()
        item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
        item["Tags"] = [
            {"tag": t}
            for t in response.css(
                "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
            ).extract()
        ]
        item["starRating"] = len(
            response.css(
                "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
            )
        )
        item["numberOfReviews"] = response.css(
            "span.hz-star-rate__review-string::text"
        ).get()
        item["image_urls"] = response.css(
            "div.view-photo-image-pane > img::attr(src)"
        ).extract()
        item["similarIdeas"] = []

        # The space id is the part of the page URL after the last "~".
        space_id = response.url.split("~")[-1]
        payload = {
            "spaceId": space_id,
            "fromItem": "0",
            "itemsPerPage": "10",
            "contentDescriptor": '{"t":1,"et":12,"id":6258114}',
        }
        api_response = requests.post(
            self.similar_ideas_api_url,
            cookies=self.cookies,
            headers=self.headers,
            data=payload,
        )
        spaces = api_response.json()["spaceData"]["spaces"]
        space_urls = [space["url"] for space in spaces.values()]

        for s_url in space_urls:
            space_response = requests.get(url=s_url, headers=self.headers)
            similar_space = Selector(text=space_response.text)
            item["similarIdeas"].append(
                {
                    "ideaUrl": space_response.url,
                    "Title": similar_space.css(
                        "h1.hz-view-photo__space-info__title.text-bold::text"
                    ).get(),
                    "SubTitle": similar_space.css(
                        "h1.hz-view-photo__space-info__subtitle.text-m::text"
                    ).get(),
                    "uploadedBy": similar_space.css(
                        "div.vph-owner-info__details ::text"
                    ).get(),
                    "Tags": [
                        {"tag": t}
                        for t in similar_space.css(
                            "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
                        ).extract()
                    ],
                    # Rating and review count must come from the similar page
                    # itself, not from the parent page in ``response``.
                    "starRating": len(
                        similar_space.css(
                            "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
                        )
                    ),
                    "numberOfReviews": similar_space.css(
                        "span.hz-star-rate__review-string::text"
                    ).get(),
                    "image_urls": similar_space.css(
                        "div.view-photo-image-pane > img::attr(src)"
                    ).extract(),
                }
            )

        yield item

I tried to chain the requests by passing the item in the requests' meta, but it didn't work as expected: it collects only one similarIdeas entry when there should be at least 24 or 25, and it produces a lot of duplicates even though I haven't set dont_filter=True. Here is my code that tries to chain the requests:

import scrapy
from scrapy.selector import Selector
import json
import math
import requests


class HouzzSimilar(scrapy.Spider):
    name = "houzz_s"

    # custom settings
    custom_settings = {
        "LOG_FILE": "houzz_spider.log",
        "IMAGES_STORE": "houzz_images",
        "FEEDS": {
            "houzz.json": {
                "format": "json",
            }
        },
        "ITEM_PIPELINES": {
            "houzz_crawler.pipelines.HouzzImagePipeline": 1,
        },
    }

    headers = {
        "authority": "www.houzz.com",
        "accept": "*/*",
        "accept-language": "en,ru;q=0.9",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://www.houzz.com",
        "referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
        "rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
        "sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
        "x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
        "x-hz-request": "true",
        "x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
        "x-ol-exp-name": "Photo - View",
        "x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
        "x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-ol-product": "Houzz",
        "x-ol-product-variant": "Houzz US",
        "x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
        "x-requested-with": "XMLHttpRequest",
    }

    cookies = {
        "v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
        "vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
        "_gcl_au": "1.1.17413922.1683311086",
        "crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
        "_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
        "_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
        "g_state": '{"i_p":1684144918349,"i_l":3}',
        "browseResultSetGridWidth": "554",
        "_gid": "GA1.2.1176067560.1683652076",
        "ln_or": "eyIzODE1NzE2IjoiZCJ9",
        "_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
        "jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
        "documentWidth": "1318",
        "_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
        "_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
        "_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
        "IR_gbd": "houzz.com",
        "IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
        "_ga": "GA1.2.1658927820.1683311086",
        "_dc_gtm_UA-3519678-1": "1",
        "_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
        "hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
    }

    base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"

    similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"

    def start_requests(self):
        """Seed the crawl with a single request for the photo listing page."""
        listing_request = scrapy.Request(
            url=self.base_url,
            headers=self.headers,
            callback=self.parse_ideas,
        )
        yield listing_request

    def parse_ideas(self, response):
        """Schedule one request per photo card found on the listing page.

        Args:
            response: the listing page response.

        Yields:
            scrapy.Request: one request per photo-card link, handled by
            ``parse_project_url``.
        """
        # Pagination counters are deliberately not parsed here: nothing
        # consumes them, and indexing into the pagination text would abort
        # this callback (before any request is yielded) on pages where that
        # text is missing or shaped differently.
        ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()

        for idea in ideas:
            yield scrapy.Request(
                url=idea, headers=self.headers, callback=self.parse_project_url
            )

    def parse_project_url(self, response):
        """Derive the project page URL from a photo page and request it.

        Reads the JSON embedded in the ``hz-ctx`` script tag, looks up the
        current space in the ``SpaceStore`` and rewrites the part of the space
        URL before ``~`` (``phvw`` -> ``pj``, ``vp`` -> ``vj``), then appends
        the project id after a ``~``.
        """
        context = json.loads(response.css('script[id="hz-ctx"] ::text').get())
        page_data = context["data"]
        current_space_id = page_data["pageContentData"]["spaceId"]
        space = page_data["stores"]["data"]["SpaceStore"]["data"][current_space_id]

        project_id = space["projectId"]
        url_prefix = space["url"].split("~")[0]
        url_prefix = url_prefix.replace("phvw", "pj")
        url_prefix = url_prefix.replace("vp", "vj")

        yield scrapy.Request(
            url=url_prefix + "~" + str(project_id),
            headers=self.headers,
            callback=self.parse_project_idea,
        )

    def parse_project_idea(self, response):
        """Request every link found inside a project page's photo container."""
        photo_links = response.css(
            "div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
        ).extract()

        yield from (
            scrapy.Request(
                url=link,
                headers=self.headers,
                callback=self.parse_idea_details,
            )
            for link in photo_links
        )

    def parse_idea_details(self, response):
        """Scrape a photo page into an item and ask the API for similar spaces.

        The partially filled item (``similarIdeas`` still empty) travels with
        the POST request in ``meta`` so ``get_similar_ideas_urls`` can pick it
        up.
        """
        css = response.css
        item = {
            "ideaUrl": response.url,
            "Title": css("h1.hz-view-photo__space-info__title.text-bold::text").get(),
            "subTitle": css(
                "h1.hz-view-photo__space-info__subtitle.text-m::text"
            ).get(),
            "spaceDescription": css(
                "div.hz-view-photo__space-info__description.text-m ::text"
            ).get(),
            "uploadedBy": css("div.vph-owner-info__details ::text").get(),
            "Tags": [
                {"tag": text}
                for text in css(
                    "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
                ).extract()
            ],
            "starRating": len(
                css(
                    "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
                )
            ),
            "numberOfReviews": css("span.hz-star-rate__review-string::text").get(),
            "image_urls": css("div.view-photo-image-pane > img::attr(src)").extract(),
            "similarIdeas": [],
        }

        # The space id is the part of the page URL after the last "~".
        space_id = response.url.split("~")[-1]
        form_body = f"spaceId={space_id}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"

        similar_request = scrapy.Request(
            url=self.similar_ideas_api_url,
            method="POST",
            cookies=self.cookies,
            headers=self.headers,
            body=form_body,
            meta={"item": item},
            callback=self.get_similar_ideas_urls,
        )
        yield similar_request

    def get_similar_ideas_urls(self, response):
        """Start the chain of similar-idea requests for the item in ``meta``.

        Collects the ``url`` of every entry under ``spaceData.spaces`` in the
        JSON response, requests the first one and hands the remaining URLs
        (plus the item) to ``parse_similar_ideas`` through ``meta``.
        """
        data = response.json()["spaceData"]["spaces"]
        space_keys = list(data.keys())
        space_urls = [data[key]["url"] for key in space_keys]
        item = response.meta.get("item")

        # NOTE(review): ``space_urls[0]`` raises IndexError when the API
        # returns no spaces, in which case the item is never yielded.
        yield scrapy.Request(
            url=space_urls[0],
            headers=self.headers,
            meta={"item": item, "space_urls": space_urls[1:]},
            callback=self.parse_similar_ideas,
        )

    def parse_similar_ideas(self, response):
        """Append one similar idea to the item and request the next URL.

        The item and the list of URLs still to visit are read from
        ``response.meta``.
        """
        item = response.meta.get("item")
        space_urls = response.meta.get("space_urls")
        item["similarIdeas"].append(
            {
                "ideaUrl": response.url,
                "Title": response.css(
                    "h1.hz-view-photo__space-info__title.text-bold::text"
                ).get(),
                "subTitle": response.css(
                    "h1.hz-view-photo__space-info__subtitle.text-m::text"
                ).get(),
                "spaceDescription": response.css(
                    "div.hz-view-photo__space-info__description.text-m ::text"
                ).get(),
                "uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
                "Tags": [
                    {"tag": t}
                    for t in response.css(
                        "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
                    ).extract()
                ],
                "starRating": len(
                    response.css(
                        "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
                    )
                ),
                "numberOfReviews": response.css(
                    "span.hz-star-rate__review-string::text"
                ).get(),
                "image_urls": response.css(
                    "div.view-photo-image-pane > img::attr(src)"
                ).extract(),
            }
        )

        # NOTE(review): ``pop(0)`` already removes the URL being requested, so
        # passing ``space_urls[1:]`` drops one more URL that is never visited.
        if len(space_urls) > 0:
            yield scrapy.Request(
                url=space_urls.pop(0),
                headers=self.headers,
                meta={"item": item, "space_urls": space_urls[1:]},
                callback=self.parse_similar_ideas,
            )

        # NOTE(review): this runs on every call, so the same item object is
        # yielded once per visited similar page, not once at the end.
        yield item

my expected output: https://jsoneditoronline.org/#left=cloud.6a9b829e90014b55975756556c3d0f2d


Solution

  • Your second example is really close. There are just a couple of things I would recommend, and one thing that is missing:

    1. You should use cb_kwargs to pass data between callback methods instead of the meta dict. Both will work, but cb_kwargs is what Scrapy recommends in this situation, and I believe it makes the code more readable and requires fewer lines.

    2. When running your second example I ran into quite a few situations where the duplicates filter was triggered. When chaining requests like this, having a single request filtered will likely mean that the item is never yielded. To avoid this, you should make your space_urls variable a set instead of a list so you know each URL is unique, and you should also add the dont_filter argument to the requests in your parse_similar_ideas method.

    3. The last and most important point is that you are yielding an item at the end of every call to parse_similar_ideas, which means you are yielding the same item once for every single URL in the space_urls list, and the only thing that changes is the number of entries in the item's similarIdeas field. What you actually want is to yield the item only once there are no more URLs left in space_urls, so that it is yielded a single time at the very end of the chain.

    The example below implements the above points and creates the output you are expecting. You will want to add your custom_settings back to the example though.

    import scrapy
    import json
    
    class HouzzSimilar(scrapy.Spider):
        name = "houzz"
    
        headers = {
            "authority": "www.houzz.com",
            "accept": "*/*",
            "accept-language": "en,ru;q=0.9",
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "origin": "https://www.houzz.com",
            "referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
            "rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
            "sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Linux"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
            "x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
            "x-hz-request": "true",
            "x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
            "x-ol-exp-name": "Photo - View",
            "x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
            "x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
            "x-ol-product": "Houzz",
            "x-ol-product-variant": "Houzz US",
            "x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
            "x-requested-with": "XMLHttpRequest",
        }
    
        cookies = {
            "v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
            "vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
            "_gcl_au": "1.1.17413922.1683311086",
            "crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
            "_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
            "_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
            "g_state": '{"i_p":1684144918349,"i_l":3}',
            "browseResultSetGridWidth": "554",
            "_gid": "GA1.2.1176067560.1683652076",
            "ln_or": "eyIzODE1NzE2IjoiZCJ9",
            "_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
            "jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
            "documentWidth": "1318",
            "_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
            "_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
            "_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
            "IR_gbd": "houzz.com",
            "IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
            "_ga": "GA1.2.1658927820.1683311086",
            "_dc_gtm_UA-3519678-1": "1",
            "_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
            "hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
        }
    
        base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
    
        similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"
    
        def start_requests(self):
            """Seed the crawl with a single request for the photo listing page."""
            listing_request = scrapy.Request(
                url=self.base_url,
                headers=self.headers,
                callback=self.parse_ideas,
            )
            yield listing_request
    
        def parse_ideas(self, response):
            """Schedule one request per photo card found on the listing page.

            Args:
                response: the listing page response.

            Yields:
                scrapy.Request: one request per photo-card link, handled by
                ``parse_project_url``.
            """
            # Pagination counters are deliberately not parsed here: nothing
            # consumes them, and indexing into the pagination text would abort
            # this callback (before any request is yielded) on pages where
            # that text is missing or shaped differently.
            ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()

            for idea in ideas:
                yield scrapy.Request(
                    url=idea, headers=self.headers, callback=self.parse_project_url
                )
    
        def parse_project_url(self, response):
            """Derive the project page URL from a photo page and request it.

            Reads the JSON embedded in the ``hz-ctx`` script tag, looks up the
            current space in the ``SpaceStore`` and rewrites the part of the
            space URL before ``~`` (``phvw`` -> ``pj``, ``vp`` -> ``vj``), then
            appends the project id after a ``~``.
            """
            context = json.loads(response.css('script[id="hz-ctx"] ::text').get())
            page_data = context["data"]
            current_space_id = page_data["pageContentData"]["spaceId"]
            space = page_data["stores"]["data"]["SpaceStore"]["data"][current_space_id]

            project_id = space["projectId"]
            url_prefix = space["url"].split("~")[0]
            url_prefix = url_prefix.replace("phvw", "pj")
            url_prefix = url_prefix.replace("vp", "vj")

            yield scrapy.Request(
                url=url_prefix + "~" + str(project_id),
                headers=self.headers,
                callback=self.parse_project_idea,
            )
    
        def parse_project_idea(self, response):
            """Request every link found inside a project page's photo container."""
            photo_links = response.css(
                "div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
            ).extract()

            yield from (
                scrapy.Request(
                    url=link,
                    headers=self.headers,
                    callback=self.parse_idea_details,
                )
                for link in photo_links
            )
    
        def parse_idea_details(self, response):
            """Scrape a photo page into an item and ask the API for similar spaces.

            The partially filled item (``similarIdeas`` still empty) is handed
            to ``get_similar_ideas_urls`` as a keyword argument via
            ``cb_kwargs``.
            """
            css = response.css
            item = {
                "ideaUrl": response.url,
                "Title": css(
                    "h1.hz-view-photo__space-info__title.text-bold::text"
                ).get(),
                "subTitle": css(
                    "h1.hz-view-photo__space-info__subtitle.text-m::text"
                ).get(),
                "spaceDescription": css(
                    "div.hz-view-photo__space-info__description.text-m ::text"
                ).get(),
                "uploadedBy": css("div.vph-owner-info__details ::text").get(),
                "Tags": [
                    {"tag": text}
                    for text in css(
                        "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
                    ).extract()
                ],
                "starRating": len(
                    css(
                        "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
                    )
                ),
                "numberOfReviews": css(
                    "span.hz-star-rate__review-string::text"
                ).get(),
                "image_urls": css(
                    "div.view-photo-image-pane > img::attr(src)"
                ).extract(),
                "similarIdeas": [],
            }

            # The space id is the part of the page URL after the last "~".
            space_id = response.url.split("~")[-1]
            form_body = f"spaceId={space_id}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"

            similar_request = scrapy.Request(
                url=self.similar_ideas_api_url,
                method="POST",
                cookies=self.cookies,
                headers=self.headers,
                body=form_body,
                cb_kwargs={"item": item},  # <-- cb_kwargs instead of meta
                callback=self.get_similar_ideas_urls,
            )
            yield similar_request
    
        def get_similar_ideas_urls(self, response, item=None):
            """Start the chain of similar-idea requests for ``item``.

            Args:
                response: JSON response of the similar-spaces endpoint; the
                    URLs are read from ``spaceData.spaces[*].url``.
                item: the partially built item passed in through ``cb_kwargs``.

            Yields:
                scrapy.Request for the first similar page (the remaining URLs
                travel along in ``cb_kwargs``), or ``item`` itself when the
                API returned no similar spaces.
            """
            spaces = response.json()["spaceData"]["spaces"]
            # A set guarantees each similar URL is visited once per item.
            space_urls = {space["url"] for space in spaces.values()}  # <- set

            if not space_urls:
                # Nothing to chain: emit the item now. Popping from an empty
                # set would raise KeyError and the item would be lost.
                yield item
                return

            yield scrapy.Request(
                url=space_urls.pop(),
                headers=self.headers,
                cb_kwargs={"item": item, "space_urls": space_urls},
                # The item only exists inside this chain, so a request dropped
                # by the duplicates filter would drop the whole item with it.
                dont_filter=True,
                callback=self.parse_similar_ideas,
            )
    
        def parse_similar_ideas(self, response, item=None, space_urls=None):
            """Record one similar idea, then continue the chain or finish it.

            ``item`` and ``space_urls`` (the URLs still to visit) arrive via
            ``cb_kwargs``. The item is yielded only when ``space_urls`` is
            empty, i.e. once at the very end of the chain.
            """
            css = response.css
            similar_idea = {
                "ideaUrl": response.url,
                "Title": css(
                    "h1.hz-view-photo__space-info__title.text-bold::text"
                ).get(),
                "subTitle": css(
                    "h1.hz-view-photo__space-info__subtitle.text-m::text"
                ).get(),
                "spaceDescription": css(
                    "div.hz-view-photo__space-info__description.text-m ::text"
                ).get(),
                "uploadedBy": css("div.vph-owner-info__details ::text").get(),
                "Tags": [
                    {"tag": text}
                    for text in css(
                        "ul.hz-view-photo__breadcrumb.hz-track-me ::text"
                    ).extract()
                ],
                "starRating": len(
                    css(
                        "span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
                    )
                ),
                "numberOfReviews": css(
                    "span.hz-star-rate__review-string::text"
                ).get(),
                "image_urls": css(
                    "div.view-photo-image-pane > img::attr(src)"
                ).extract(),
            }
            item["similarIdeas"].append(similar_idea)

            if len(space_urls) == 0:
                # <--- this was the piece you were missing: yield only at the end
                yield item
                return

            yield scrapy.Request(
                url=space_urls.pop(),
                headers=self.headers,
                cb_kwargs={"item": item, "space_urls": space_urls},
                dont_filter=True,  # <--- add this
                callback=self.parse_similar_ideas,
            )