Search code examples
python web-scraping scrapy

Scrapy: cannot reach second callback function when sending request with query parameter strings


I am scraping the engineering blog posts from Meta. For now I am just trying to print the title and URL of each post. Thanks for any help.

Here is what I have so far. The spider never reaches the parse_loadmore callback and prints nothing. I tried copying the loadmore_endpoint URL into the browser and it works fine there: it returns some HTML, as expected.

import scrapy
from urllib.parse import urlencode
import pdfkit
import requests
import re
import json
from bs4 import BeautifulSoup
# from ..helpers import generate_pdfs_file_path

# Value-less switches map to None; "encoding" is the only entry with a value.
# The 'no-images' switch stays disabled, as before.
# NOTE(review): presumably wkhtmltopdf options meant for pdfkit (imported
# above); nothing in the visible code reads this dict yet - confirm.
options = dict.fromkeys(("disable-javascript", "disable-external-links", "quiet"))
options["encoding"] = "UTF-8"


class MetaSpider(scrapy.Spider):
    """Spider that prints the title and URL of posts from the listed category pages.

    Flow: ``start_requests`` fetches every URL in ``start_urls``;
    ``parse_initial`` reads the load-more endpoint and query arguments out of
    the fetched page and requests four load-more pages; ``parse_loadmore``
    prints one line per ``article.post`` element it finds in each response.
    """

    name = "meta_spider"
    # Not referenced by the methods below; the endpoint actually requested is
    # the one extracted from the page in parse_initial.
    api_endpoint = "https://engineering.fb.com/wp-json/fb/v1/loadmore"
    start_urls = [
        "https://engineering.fb.com/category/core-infra/",
        # "https://engineering.fb.com/category/data-infrastructure/",
        # "https://engineering.fb.com/category/developer-tools/",
        # "https://engineering.fb.com/category/production-engineering/",
        # "https://engineering.fb.com/category/security/",
    ]
    # Not updated anywhere in the visible code.
    post_fetched = 0

    def start_requests(self):
        """Yield one request per start URL, handled by ``parse_initial``."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse_initial)
    def parse_initial(self, response):
        """Build and yield the load-more requests for pages 0 through 3.

        The helper returns None when the page has no ``loadmore_params``
        script, in which case the tuple unpacking below raises TypeError.
        """
        endpoint, query_args = get_loadmore_endpoints_and_params(response)
        for page in range(4):
            params = {
                "action": "loadmore",
                "queryArgs": json.dumps(query_args),
                "page": page,
                "post_type": "post",
            }
            # The whole query travels in the URL, so the resulting URL is long.
            loadmore_endpoint = get_load_more_posts_url(endpoint, params=params)
            # print(f"Sending Request {loadmore_endpoint}")
            yield scrapy.Request(url=loadmore_endpoint,  callback=self.parse_loadmore)

    def parse_loadmore(self, response):
        """Print the sanitized title and the URL of every post in ``response``."""
        print("parse_loadmore called with response: {}".format(response.text))
        # The CSS selectors are run directly on the response as received.
        # NOTE(review): the answer to this question reports that the endpoint
        # returns JSON rather than HTML, so these selectors would not apply
        # to it as-is - confirm.
        for post in response.css("article.post"):
            header = post.css("header.entry-header")
            # .get() returns None when nothing matches, so .strip() raises
            # AttributeError for a post without a title link.
            title = header.css(".entry-title a::text").get().strip()
            url = header.css(".entry-title a::attr(href)").get()

            # Sanitize the title to create a valid filename: drop everything
            # except word characters, whitespace and hyphens, then turn
            # spaces into underscores.
            safe_title = re.sub(r"[^\w\s-]", "", title).replace(" ", "_")
            print(f"----title: {safe_title}, url: {url}----")
  


def clean_post_html(soup):
    """Remove non-content elements from a parsed post, modifying it in place.

    Drops every ``script`` and ``noscript`` element, every element with the
    ``sharedaddy`` class, and the element whose id is
    ``post-feat-image-container`` when one is found.  Returns nothing.
    """
    # Scripts are removed before noscripts, one tag name at a time.
    for tag_name in ("script", "noscript"):
        for node in soup.find_all(tag_name):
            node.decompose()

    for tagged in soup.find_all(class_="sharedaddy"):
        tagged.decompose()

    featured_image = soup.find(id="post-feat-image-container")
    if featured_image:
        featured_image.decompose()


def get_loadmore_endpoints_and_params(response):
    """Read the load-more endpoint and query arguments embedded in a page.

    Looks for a ``<script>`` whose text mentions ``loadmore_params`` and
    parses the value assigned by ``var loadmore_params = ...;`` as JSON.

    Returns:
        ``(restfulURL, posts)`` taken from the parsed object, or ``None``
        when no such script or assignment is present.

    Raises:
        json.JSONDecodeError: the captured text is not valid JSON.
        KeyError: the parsed object lacks ``restfulURL`` or ``posts``.
    """
    script_text = response.xpath(
        '//script[contains(., "loadmore_params")]/text()'
    ).get()
    if not script_text:
        return None

    # Capture everything between the assignment and the first semicolon.
    assignment = re.search(r"var loadmore_params = (.*?);", script_text)
    if not assignment:
        return None

    loadmore = json.loads(assignment.group(1))
    return loadmore["restfulURL"], loadmore["posts"]


def get_load_more_posts_url(url, params):
    """Return ``url`` followed by ``?`` and ``params`` encoded as a query string.

    Sequence values in ``params`` are expanded into repeated keys.
    """
    encoded = urlencode(params, doseq=True)
    return "?".join((url, encoded))


Solution

  • There are 2 things needed to achieve your goal.

    1. In settings.py, or in your spider's custom_settings attribute, set "URLLENGTH_LIMIT" to a value higher than the default. The load-more endpoint URL is very long and exceeds the limit Scrapy imposes by default, so Scrapy filters the request out and the callback is never called.

    2. The response passed to your parse_loadmore method is JSON, so you cannot run CSS selectors on it directly. Instead, call response.json() to get the HTML text, then wrap that text in a scrapy.Selector and run your CSS and XPath queries against the HTML inside the string.

    For example:

    import scrapy
    from urllib.parse import urlencode
    
    import re
    import json
    # from ..helpers import generate_pdfs_file_path
    
    # Value-less switches map to None; "encoding" is the only entry with a value.
    # The 'no-images' switch stays disabled, as before.
    # NOTE(review): nothing in the visible code reads this dict - confirm it
    # is still needed.
    options = dict.fromkeys(("disable-javascript", "disable-external-links", "quiet"))
    options["encoding"] = "UTF-8"
    
    
    class MetaSpider(scrapy.Spider):
        """Print the title and URL of posts from the listed category pages.

        Flow: ``start_requests`` fetches every URL in ``start_urls``;
        ``parse_initial`` reads the load-more endpoint and query arguments out
        of the fetched page and requests four load-more pages;
        ``parse_loadmore`` prints one line per post found in each response.
        """

        name = "meta_spider"
        api_endpoint = "https://engineering.fb.com/wp-json/fb/v1/loadmore"
        start_urls = [
            "https://engineering.fb.com/category/core-infra/",
            # "https://engineering.fb.com/category/data-infrastructure/",
            # "https://engineering.fb.com/category/developer-tools/",
            # "https://engineering.fb.com/category/production-engineering/",
            # "https://engineering.fb.com/category/security/",
        ]
        post_fetched = 0
        # The load-more URL carries the whole query as a parameter and is
        # longer than Scrapy's default URL length limit, so the limit has to
        # be raised or the requests never reach parse_loadmore.
        custom_settings = {
            "URLLENGTH_LIMIT": 20000,
        }

        def start_requests(self):
            """Yield one request per start URL, handled by ``parse_initial``."""
            for url in self.start_urls:
                yield scrapy.Request(url, self.parse_initial)

        def parse_initial(self, response):
            """Yield the load-more requests (pages 0 through 3) for one category page.

            Logs a warning and yields nothing when the page does not expose
            its load-more parameters, instead of failing on tuple unpacking.
            """
            loadmore = get_loadmore_endpoints_and_params(response)
            if loadmore is None:
                self.logger.warning("No loadmore_params found on %s", response.url)
                return

            endpoint, query_args = loadmore
            for page in range(4):
                params = {
                    "action": "loadmore",
                    "queryArgs": json.dumps(query_args),
                    "page": page,
                    "post_type": "post",
                }
                loadmore_endpoint = get_load_more_posts_url(endpoint, params=params)
                yield scrapy.Request(url=loadmore_endpoint, callback=self.parse_loadmore)

        def parse_loadmore(self, response):
            """Print the sanitized title and the URL of every post in ``response``.

            The endpoint answers with JSON whose payload is an HTML string, so
            the payload is decoded first and then wrapped in a Selector to
            make CSS queries possible.
            """
            try:
                html = response.json()
            except ValueError:
                self.logger.warning("Load-more response from %s is not JSON", response.url)
                return
            if not isinstance(html, str):
                # Selector(text=...) needs a string; anything else (for
                # example an error object) cannot contain posts.
                self.logger.warning(
                    "Unexpected load-more payload type %s from %s",
                    type(html).__name__,
                    response.url,
                )
                return

            resp = scrapy.Selector(text=html)

            for post in resp.css("article.post"):
                header = post.css("header.entry-header")
                title = header.css(".entry-title a::text").get()
                url = header.css(".entry-title a::attr(href)").get()
                if title is None:
                    # A post without a title link has nothing to print.
                    continue

                # Sanitize the title to create a valid filename: drop
                # everything except word characters, whitespace and hyphens,
                # then turn spaces into underscores.
                safe_title = re.sub(r"[^\w\s-]", "", title.strip()).replace(" ", "_")
                print(f"----title: {safe_title}, url: {url}----")
    
    
    
    def clean_post_html(soup):
        """Remove non-content elements from a parsed post, modifying it in place.

        Drops every ``script`` and ``noscript`` element, every element with
        the ``sharedaddy`` class, and the element whose id is
        ``post-feat-image-container`` when one is found.  Returns nothing.
        """
        # Scripts are removed before noscripts, one tag name at a time.
        for tag_name in ("script", "noscript"):
            for node in soup.find_all(tag_name):
                node.decompose()

        for tagged in soup.find_all(class_="sharedaddy"):
            tagged.decompose()

        featured_image = soup.find(id="post-feat-image-container")
        if featured_image:
            featured_image.decompose()
    
    
    def get_loadmore_endpoints_and_params(response):
        """Read the load-more endpoint and query arguments embedded in a page.

        Looks for a ``<script>`` whose text mentions ``loadmore_params`` and
        decodes the JSON value assigned to ``var loadmore_params``.

        Returns:
            ``(restfulURL, posts)`` taken from the decoded object, or ``None``
            when no such script or assignment is present.

        Raises:
            json.JSONDecodeError: the assigned value is not valid JSON.
            KeyError: the decoded object lacks ``restfulURL`` or ``posts``.
        """
        script_content = response.xpath(
            '//script[contains(., "loadmore_params")]/text()'
        ).get()
        if not script_content:
            return None

        # Locate the assignment only; tolerate any spacing around "=".
        assignment = re.search(r"var\s+loadmore_params\s*=\s*", script_content)
        if assignment is None:
            return None

        # raw_decode reads exactly one JSON value and ignores what follows it,
        # so a ";" inside a string value (or a value spanning several lines)
        # no longer truncates the JSON the way a "(.*?);" capture does.
        params, _ = json.JSONDecoder().raw_decode(script_content[assignment.end():])
        return params["restfulURL"], params["posts"]
    
    
    def get_load_more_posts_url(url, params):
        """Return ``url`` with ``params`` appended as a query string.

        Sequence values in ``params`` are expanded into repeated keys.  The
        parameters are joined with ``&`` when ``url`` already has a query
        string (for example ``...?rest_route=/fb/v1/loadmore``) and with
        ``?`` otherwise.  ``url`` is returned unchanged when ``params``
        encodes to nothing.
        """
        query_string = urlencode(params, doseq=True)
        if not query_string:
            return url

        if url.endswith(("?", "&")):
            separator = ""
        elif "?" in url:
            separator = "&"
        else:
            separator = "?"
        return f"{url}{separator}{query_string}"
    
    

    PARTIAL OUTPUT

    2023-11-23 19:42:48 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
    2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/category/core-infra/> (referer: None)
    2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
    22%3A%5C%22%5C%22%2C%5C%22m%5C%22%3A%5C%22%5C%22%2C%5C%22p%5C%22%3A0%2C%5C%22post_parent%5C%22%3A%5C%22%5C%22%2C%5C%22subpost%5C%22%3A%5C%22%5C%22%2C%5C%22subpost_id%5C%22%3A%5C%22%5C%22%2C%5C%22attachment%5C%22%3A%5C....5C%22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=3&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
    2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
    22%3A%5C%22%5C%22%2C%5C%22m%5C%22%3A%5C%22%5C%22%2C%5C%22p%5C%22%3A0%2C%5C%22post_parent%5C%22%3A%5C%22%5C%22%2C%5C%22subpost%5C%22%3A%5C%22%5C%22%2C%5C%22subpost_id%5C%22%3A%5C%22%5C%22%2C%5C%22attachment%5C%22%3A%5C....%22&page=0&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
    2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
    22%3A%5C%22...e%5C%22%3A%5C%2250%5C%22%2C%5C%22no_found_rows%5C%22%3Afalse%2C%5C
    %22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=1&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
    2023-11-23 19:42:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://engineering.fb.com/wp-json/fb/v1/loadmore?action=loadmore&queryArgs=%22%7B%5C%22category_name%5C%22%3A%5C%22core-infra%5C%22%2C%5C%22error%5C%
    22%3A%5C%22%5C...2%3Atrue%2C%5C%22post_type%5C%22%3A%5C%22%5C%22%2C%5C%22posts_per_page%5C%22%3A12%2C%5C%22nopaging%5C%22%3Afalse%2C%5C%22comments_per_page%5C%22%3A%5C%2250%5C%22%2C%5C%22no_found_rows%5C%22%3Afalse%2C%5C
    %22order%5C%22%3A%5C%22DESC%5C%22%7D%22&page=2&post_type=post> (referer: https://engineering.fb.com/category/core-infra/)
    ----title: Introducing_Velox_An_open_source_unified_execution_engine, url: https://engineering.fb.com/2023/03/09/open-source/velox-open-source-execution-engine/----
    ----title: Metas_head_of_AR_hardware_on_the_future_of_AR, url: https://engineering.fb.com/2023/02/24/virtual-reality/ar-vr-meta-caitlin-kalinowski/----
    ----title: How_Meta_brought_AV1_to_Reels, url: https://engineering.fb.com/2023/02/21/video-engineering/av1-codec-facebook-instagram-reels/----
    ----title: Inside_Metas_first_smart_glasses, url: https://engineering.fb.com/2023/02/16/virtual-reality/developing-meta-rayban-stories/----
    ----title: Building_a_cross-platform_runtime_for_AR, url: https://engineering.fb.com/2023/02/13/virtual-reality/meta-ar-augmented-reality-cross-platform-runtime/----
    ----title: Improving_Metas_global_maps, url: https://engineering.fb.com/2023/02/07/web/basemap-facebook-instagram-whatsapp-improvements/----
    ----title: The_evolution_of_Facebooks_iOS_app_architecture, url: https://engineering.fb.com/2023/02/06/ios/facebook-ios-app-architecture/----
    ----title: Asynchronous_computing_at_Meta_Overview_and_learnings, url: https://engineering.fb.com/2023/01/31/production-engineering/meta-asynchronous-computing/----
    ----title: Watch_Metas_engineers_discuss_optimizing_large-scale_networks, url: https://engineering.fb.com/2023/01/27/networking-traffic/optimizing-large-scale-networks-meta-engineers/----
    ----title: Tulip_Modernizing_Metas_data_platform, url: https://engineering.fb.com/2023/01/26/data-infrastructure/tulip-modernizing-metas-data-platform/----
    ----title: Open-sourcing_Anonymous_Credential_Service, url: https://engineering.fb.com/2022/12/12/security/anonymous-credential-service-acs-open-source/----
    ----title: Enabling_static_analysis_of_SQL_queries_at_Meta, url: https://engineering.fb.com/2022/11/30/data-infrastructure/static-analysis-sql-queries/----
    ----title: Writing_and_linting_Python_at_scale, url: https://engineering.fb.com/2023/11/21/production-engineering/writing-linting-python-at-scale-meta/----
    ----title: Watch_Metas_engineers_on_building_network_infrastructure_for_AI, url: https://engineering.fb.com/2023/11/15/networking-traffic/watch-metas-engineers-on-building-network-infrastructure-for-ai/----
    ----title: Enhancing_the_security_of_WhatsApp_calls, url: https://engineering.fb.com/2023/11/08/security/whatsapp-calls-enhancing-security/----
    ----title: How_Meta_built_Threads_in_5_months, url: https://engineering.fb.com/2023/11/06/android/how-meta-built-threads-in-5-months/----
    ----title: Automating_data_removal, url: https://engineering.fb.com/2023/10/31/data-infrastructure/automating-data-removal/----
    ----title: Automating_dead_code_cleanup, url: https://engineering.fb.com/2023/10/24/data-infrastructure/automating-dead-code-cleanup/----
    ----title: 5_Things_you_didnt_know_about_Buck2, url: https://engineering.fb.com/2023/10/23/developer-tools/5-things-you-didnt-know-about-buck2/----
    ----title: How_Meta_is_creating_custom_silicon_for_AI, url: https://engineering.fb.com/2023/10/18/ml-applications/meta-ai-custom-silicon-olivia-wu/----
    ----title: Automating_product_deprecation, url: https://engineering.fb.com/2023/10/17/data-infrastructure/automating-product-deprecation-meta/----
    ----title: Meta_contributes_new_features_to_Python_312, url: https://engineering.fb.com/2023/10/05/developer-tools/python-312-meta-new-features/----
    ----title: Meta_Quest_2_Defense_through_offense, url: https://engineering.fb.com/2023/09/12/security/meta-quest-2-defense-through-offense/----
    ----title: Using_Chakra_execution_traces_for_benchmarking_and_network_performance_optimization, url: https://engineering.fb.com/2023/09/07/networking-traffic/chakra-execution-traces-benchmarking-network-performance-op
    timization/----
    ----title: Arcadia_An_end-to-end_AI_system_performance_simulator, url: https://engineering.fb.com/2023/09/07/data-infrastructure/arcadia-end-to-end-ai-system-performance-simulator/----
    ----title: Threads_The_inside_story_of_Metas_newest_social_app, url: https://engineering.fb.com/2023/09/07/culture/threads-inside-story-metas-newest-social-app/----
    ----title: What_is_it_like_to_write_code_at_Meta, url: https://engineering.fb.com/2023/09/05/web/what-like-ship-code-meta-tech-podcast/----
    ----title: Scheduling_Jupyter_Notebooks_at_Meta, url: https://engineering.fb.com/2023/08/29/security/scheduling-jupyter-notebooks-meta/----
    ----title: Code_Llama_Metas_state-of-the-art_LLM_for_coding, url: https://ai.meta.com/blog/code-llama-large-language-model-coding/----
    ----title: Introducing_Immortal_Objects_for_Python, url: https://engineering.fb.com/2023/08/15/developer-tools/immortal-objects-for-python-instagram-meta/----
    ----title: Meta_Connect_2023_September_27__28, url: https://www.meta.com/blog/quest/connect-2023-september-27-28-menlo-park-vr-ai----
    ----title: Scaling_the_Instagram_Explore_recommendations_system, url: https://engineering.fb.com/2023/08/09/ml-applications/scaling-instagram-explore-recommendations-system/----
    ----title: How_Meta_is_improving_password_security_and_preserving_privacy, url: https://engineering.fb.com/2023/08/08/security/how-meta-is-improving-password-security-and-preserving-privacy/----
    ----title: Fixit_2_Metas_next-generation_auto-fixing_linter, url: https://engineering.fb.com/2023/08/07/developer-tools/fixit-2-linter-meta/----
    ----title: Using_short-lived_certificates_to_protect_TLS_secrets, url: https://engineering.fb.com/2023/08/07/security/short-lived-certificates-protect-tls-secrets/----
    ----title: Bringing_HDR_video_to_Reels, url: https://engineering.fb.com/2023/07/17/video-engineering/hdr-video-reels-meta/----
    ----title: Metas_Evenstar_is_transitioning_to_OCP_to_accelerate_open_RAN_adoption, url: https://engineering.fb.com/2023/06/29/connectivity/evenstar-meta-ocp-open-ran/----
    ----title: Meta_developer_tools_Working_at_scale, url: https://engineering.fb.com/2023/06/27/developer-tools/meta-developer-tools-open-source/----
    ----title: Bombyx_is_being_licensed_for_product_development, url: https://engineering.fb.com/2023/05/22/connectivity/bombyx-meta-fiber-deployment-robot-product-development/----
    ----title: MSVP_is_Metas_first_video_processing_ASIC, url: https://ai.facebook.com/blog/meta-scalable-video-processor-MSVP----
    ----title: Meta_introduces_its_first-generation_AI_inference_accelerator, url: https://ai.facebook.com/blog/meta-training-inference-accelerator-AI-MTIA----
    ----title: Building_and_deploying_MySQL_Raft_at_Meta, url: https://engineering.fb.com/2023/05/16/data-infrastructure/mysql-raft-meta/----
    ----title: The_malware_threat_landscape_NodeStealer_DuckTail_and_more, url: https://engineering.fb.com/2023/05/03/security/malware-nodestealer-ducktail/----
    ----title: A_fine-grained_network_traffic_analysis_with_Millisampler, url: https://engineering.fb.com/2023/04/17/networking-traffic/millisampler-network-traffic-analysis/----
    ----title: Deploying_key_transparency_at_WhatsApp, url: https://engineering.fb.com/2023/04/13/security/whatsapp-key-transparency/----
    ----title: How_Device_Verification_protects_your_WhatsApp_account, url: https://engineering.fb.com/2023/04/13/security/whatsapp-device-verification-protects-your-account/----
    ----title: Why_xHE-AAC_is_being_embraced_at_Meta, url: https://engineering.fb.com/2023/04/11/video-engineering/high-quality-audio-xhe-aac-codec-meta/----
    ----title: Build_faster_with_Buck2_Our_open_source_build_system, url: https://engineering.fb.com/2023/04/06/open-source/buck2-open-source-large-scale-build-system/----
    2023-11-23 19:42:49 [scrapy.core.engine] INFO: Closing spider (finished)
    2023-11-23 19:42:49 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
    {'downloader/request_bytes': 13747,
     'downloader/request_count': 5,