
"Load More" Button for Webscraping using scrapy and selenium [EDIT]


I'm currently attempting to scrape articles from the NepalItimes website. The challenge is that the site uses a "Load More" button, which has to be clicked to load additional articles. My scraping process successfully retrieves the initial page with the first six articles, but it fails to click the "Load More" button, so I cannot scrape anything beyond those first six.

Furthermore, during the scraping process it keeps fetching URLs, but instead of the desired content it returns "oops" pages, which suggests a problem with the Selenium button-clicking logic.

If someone could explain how I can handle this, I would be really grateful!
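
For reference, this is roughly the kind of Selenium click I was attempting (a simplified sketch only: the ChromeDriver setup is assumed, and the button selector mirrors the XPath I use in the spider below):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
driver.get("https://www.nepalitimes.com/news")

# wait until the "load more" button is clickable, then click it
load_more = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'btn--load')]"))
)
load_more.click()

Here is my full spider code: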

import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request

class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }

    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

# Handling the load button using Selenium --- work in progress <3
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response, **kwargs):
        # Parse the articles from the initial page
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

        # Check if there is a "Load More" button
        load_more_button = response.xpath(".//button[contains(@class, 'btn btn--load center') and contains(., 'load more')]")
        if load_more_button:
            print("Load more button detected")
            tenant_code = "epz639"
            routeId = 8
            limit = 10
            offset = 10  

            # Prepare the data payload for the POST request
            data = {
                "query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
                "variables": {
                    "tenant_code": tenant_code,
                    "routeId": routeId,
                    "limit": limit,
                    "offset": offset
                }
            }

            # Send a POST request to the endpoint using scrapy.FormRequest
            yield scrapy.FormRequest(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
                                     formdata={"query": json.dumps(data["query"]), "variables": json.dumps(data["variables"])},
                                     headers={"Content-Type": "application/json"},
                                     callback=self.parse_ajax_response)
            print("Post resquest sent")

    def parse_ajax_response(self, response):
        json_response = json.loads(response.text)
        if 'data' in json_response and 'articles' in json_response['data']:
            articles = json_response['data']['articles']
            print("Articles:", articles)
            for article in articles:
                # Assuming there's a 'slug' field in the response representing the article slug
                article_slug = article['slug']
                article_url = f"https://www.nepalitimes.com/news/{article_slug}"  # Adjust this based on the actual URL structure
                yield scrapy.Request(url=article_url, callback=self.parse_item)

    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }

Okay, so I tried what @Leandro suggested, that is to say, using Chrome DevTools instead of Selenium, but it doesn't seem to reach the parse_ajax_response function... it still runs, but it doesn't give the results I want (only 9 items were scraped). I need some help.

Here is what I get when I click the "Load More" button: Nepalitimes web dev and Payload

Here is the edited code:

import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request

class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }

    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

# Handling the load button using Selenium --- work in progress <3

    def parse(self, response, **kwargs):
        # Parse the articles from the initial page
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

        # Fetch additional articles using GraphQL API with different offset values
        tenant_code = "epz639"
        routeId = 8
        limit = 10
        offset = 10

        while True:
            data = {
                "query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
                "variables": {
                    "tenant_code": tenant_code,
                    "routeId": routeId,
                    "limit": limit,
                    "offset": offset
                }
            }

            yield scrapy.Request(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
                                 method='POST',
                                 body=json.dumps(data),
                                 headers={'Content-Type': 'application/json'},
                                 callback=self.parse_ajax_response)

            offset += limit

    def parse_ajax_response(self, response):
        json_response = json.loads(response.text)
        if 'data' in json_response and 'items' in json_response['data']:
            articles = json_response['data']['items']
            print("Data found", articles)
            for article in articles:
                article_id = article['id']
                article_url = f"https://www.nepalitimes.com/news/{article_id}"
                yield scrapy.Request(url=article_url, callback=self.parse_item)

    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }

It does load other pages (not only the news pages), and it seems not to take the parse_ajax_response() function into account... Furthermore, it tries to scrape the https://archive.nepalitimes.com/news structure, but I don't want the script to do that...
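
One thing I'm considering, to keep the crawl on www.nepalitimes.com/news and away from the archive subdomain, is tightening the rule like this (just a rough sketch, I'm not sure it's the right fix):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    # only the www subdomain, so archive.nepalitimes.com is treated as off-site
    allowed_domains = ["www.nepalitimes.com"]
    start_urls = ["https://www.nepalitimes.com/news"]

    rules = (
        Rule(
            LinkExtractor(
                allow=r"/news/",                           # only follow article links under /news
                deny_domains=["archive.nepalitimes.com"],  # drop archive links explicitly
            ),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        # same extraction logic as in the spider above
        ...

With allowed_domains narrowed to www.nepalitimes.com, the offsite middleware should also drop the archive links on its own.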


Solution

  • I think the best approach is to check out what request is being issued when you click the "Load More" button. For instance, this can be done using the Network tab in Chrome Dev Tools. Then, you can schedule this request in Scrapy after loading the first page. Probably, this request will return some JSON-like structure, which you can handle in a different method (see the callback argument in the Request object).

    This way, you can get rid of Selenium, making your scraper lighter. I hope this helps :)

    For your case, it is using a GraphQL API for querying more objects. The request may seem a bit scary, but it states what data should be returned from the server:

    Here, you can see that the request issued by the page asks the server to getArticles with a given offset and max count:

    If you take a look at the Response tab, you'll see that the response is indeed a JSON-like structure.

    So, you should add to your scraper a yield Request(...) in the parse method that mimics this request. The request body would have an attribute named query containing the string you can see in Chrome Dev Tools, and also a variables param, which is a JSON object binding values to the query parameters. (You can check the Payload tab and click "view source" to see the actual string being sent.)

    You'll probably have to do this (yield Request(...)) as many times as there are pages you want to crawl, adjusting the limit and offset parameters each time. You should also check what happens when you hit the last page.

    A tip: you can have a parse_first_load_more method for the first request. The response comes with a "totalCount": 1321, which you can use to calculate how many requests you have to issue. The following requests can then have a different callback, or you can use a meta parameter in the request to indicate that they are not the first one.
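
    A rough sketch of that variant, written as methods you would add to the spider from the example below (parse_first_load_more, parse_more_page and _graphql_request are just illustrative names; GRAPHQL_QUERY and the endpoint are the ones from that example, and the first GraphQL request issued in parse would use callback=self.parse_first_load_more):

    def _graphql_request(self, offset, callback):
        # build one POST request against the GraphQL endpoint for a given offset
        payload = {
            "query": GRAPHQL_QUERY,
            "variables": {"tenant_code": "epz639", "routeId": 8, "limit": 10, "offset": offset},
        }
        return scrapy.Request(
            "https://nepalitimes-hasura.superdesk.org/v1/graphql",
            method="POST",
            body=json.dumps(payload),
            headers={"Content-Type": "application/json"},
            callback=callback,
        )

    def parse_first_load_more(self, response):
        data = json.loads(response.text)["data"]
        total = data["metadata"]["aggregate"]["totalCount"]  # e.g. 1321
        yield from self.parse_more_page(response)
        # schedule every remaining page up front instead of chaining callbacks
        for offset in range(20, total, 10):
            yield self._graphql_request(offset, callback=self.parse_more_page)

    def parse_more_page(self, response):
        # only extracts the articles; no further scheduling needed here
        for article in json.loads(response.text)["data"]["items"]:
            yield {
                "title": article["title"],
                "link": f"{article['swp_route']['staticprefix']}/{article['slug']}",
            }

    Scheduling all offsets up front lets Scrapy parallelise the GraphQL requests instead of waiting for each response before issuing the next one.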

    The final result would be something like this (NOTE that this is just an example code):

    import json
    import scrapy
    
    
    GRAPHQL_QUERY = """
        query getArticles($tenant_code: String = \"\", $routeId: Int, $limit: Int = 10, $offset: Int = 0) {
            metadata: swp_article_aggregate(where: {tenant_code: {_eq: $tenant_code}, route_id: {_eq: $routeId}}) {
                aggregate {
                    totalCount: count
                }
            }
            items: swp_article(limit: $limit, offset: $offset, order_by: {published_at: desc}, where: {tenant_code: {_eq: $tenant_code}
         ...
    """
    
    
    class NepalTimesScraper(scrapy.Spider):
        name = "nepaltimes"
        start_urls = ["https://www.nepalitimes.com/news"]
    
        def parse(self, response):
            articles = response.xpath("//article[@class='list']/..")
    
            for article in articles:
                title = article.css("h3::text").get()
                link = article.attrib["href"]
    
                yield {"title": title, "link": link}
    
            # Now, load more
            graphql_req = {
                "query": GRAPHQL_QUERY,
                "variables": {
                    "tenant_code": "epz639",
                    "routeId": 8,
                    "limit": 10,
                    "offset": 10,
                },
            }
    
            yield scrapy.Request(
                "https://nepalitimes-hasura.superdesk.org/v1/graphql",
                method="POST",
                body=json.dumps(graphql_req),
                meta={"current_offset": 10},
                callback=self.parse_more,
            )
    
        def parse_more(self, response):
            json_response = json.loads(response.text)
            total_number_of_articles = json_response["data"]["metadata"]["aggregate"][
                "totalCount"
            ]
            current_offset = response.meta["current_offset"]
    
            for article in json_response["data"]["items"]:
                yield {
                    "title": article["title"],
                    "link": f"{article['swp_route']['staticprefix']}/{article['slug']}",
                }
    
            # keep paginating until the offset reaches the reported total
            if current_offset + 10 < total_number_of_articles:
                current_offset = current_offset + 10
    
                graphql_req = {
                    "query": GRAPHQL_QUERY,
                    "variables": {
                        "tenant_code": "epz639",
                        "routeId": 8,
                        "limit": 10,
                        "offset": current_offset,
                    },
                }
                yield scrapy.Request(
                    "https://nepalitimes-hasura.superdesk.org/v1/graphql",
                    method="POST",
                    body=json.dumps(graphql_req),
                    meta={"current_offset": current_offset},
                    callback=self.parse_more,
                )
    
    

    Hope this helps