python, web-scraping, scrapy, screen-scraping

Scrapy.Request returns <GET url> without scraping anything


I wanted to scrape the feed of sitepoint.com. This is my code:

import scrapy
from urllib.parse import urljoin


class SitepointSpider(scrapy.Spider):
    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    def parse(self, response):
        data = []
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()
            url = urljoin("https://sitepoint.com", href)
            text = scrapy.Request(url, callback=self.parse_article)

            data.append(
                {"title": title, "href": href, "img": img, "time": time, "text": text}
            )
        yield data

    def parse_article(self, response):
        text = response.xpath(
           '//*[@id="main-content"]/article/div/div/div[1]/section/text()'
        ).extract()
        yield text

And this is the response I get:

[{'title': 'How to Build an MVP with React and Firebase',
'href': '/react-firebase-build-mvp/',
'img': 'https://uploads.sitepoint.com/wp-content/uploads/2021/09/1632802723react-firebase-mvp-app.jpg',
'time': 'September 28, 2021',
'text': <GET https://sitepoint.com/react-firebase-build-mvp/>}]

It just does not scrape the URLs. I followed everything said in this question, but I still could not make it work.


Solution

  • You have to visit each article's detail page from the listing page to scrape its text.

    That means you have to yield a Request for the article URL first and then yield the item from the final callback. Calling scrapy.Request() only constructs a Request object; unless that object is yielded back to the engine, Scrapy never schedules it, which is why your output contains <GET ...> instead of the article text.

    Also, //*[@id="main-content"]/article/div/div/div[1]/section/text() won't return any usable text, because there are lots of HTML elements nested under the section tag and /text() only selects its direct text-node children (see the selector sketch after the code).

    One solution is to scrape the whole HTML inside the section tag and clean it later to get the article text.

    Here is the full working code:

    import re
    
    import scrapy
    from urllib.parse import urljoin
    
    
    class SitepointSpider(scrapy.Spider):
        # TODO: Add url tags (like /javascript) to the spider based on class parameters
        name = "sitepoint"
        allowed_domains = ["sitepoint.com"]
        start_urls = ["http://sitepoint.com/javascript/"]
    
        def clean_text(self, raw_html):
            """
            :param raw_html: this will take raw html code
            :return: text without html tags
            """
            cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
            return re.sub(cleaner, '', raw_html)
    
        def parse(self, response):
            for article in response.css("article"):
                title = article.css("a.t12xxw3g::text").get()
                href = article.css("a.t12xxw3g::attr(href)").get()
                img = article.css("img.f13hvvvv::attr(src)").get()
                time = article.css("time::text").get()
                url = urljoin("https://sitepoint.com", href)
                # yield the Request so Scrapy actually schedules the visit;
                # carry the listing data along to the callback via meta
                yield scrapy.Request(url, callback=self.parse_article, meta={"title": title,
                                                                             "href": href,
                                                                             "img": img,
                                                                             "time": time})
    
        def parse_article(self, response):
            title = response.request.meta["title"]
            href = response.request.meta["href"]
            img = response.request.meta["img"]
            time = response.request.meta["time"]
            all_data = {}
            # take the raw HTML of the whole section and strip the tags afterwards
            article_html = response.xpath('//*[@id="main-content"]/article/div/div/div[1]/section').get()
            all_data["title"] = title
            all_data["href"] = href
            all_data["img"] = img
            all_data["time"] = time
            all_data["text"] = self.clean_text(article_html)
    
            yield all_data
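
    To see why the /text() step of the original XPath comes back empty, here is a minimal sketch using Scrapy's Selector on a made-up HTML snippet (the markup below is illustrative, not SitePoint's actual page):

    from scrapy.selector import Selector

    # illustrative markup: the text lives inside child elements, not directly in <section>
    html = """
    <section>
        <h2>Intro</h2>
        <p>First paragraph of the <em>article</em>.</p>
    </section>
    """
    sel = Selector(text=html)

    # /text() selects only the direct text-node children of <section>,
    # which here are just the whitespace runs between the child tags
    print(sel.xpath("//section/text()").getall())

    # //text() descends into the child elements and finds the real content
    print(sel.xpath("//section//text()").getall())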
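
    As a side note, the regex in clean_text works, but Scrapy already depends on the w3lib library, whose helpers strip tags and decode entities without a hand-rolled pattern. Here is a minimal sketch of a drop-in replacement for the method (an alternative, not part of the original answer):

    from w3lib.html import remove_tags, replace_entities

    def clean_text(self, raw_html):
        """Strip HTML tags, then decode entities such as &amp; into plain text."""
        return replace_entities(remove_tags(raw_html))

    With either version, you can run the spider and dump the items with something like scrapy runspider sitepoint_spider.py -o articles.json (the filename here is assumed).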