Search code examples
javascriptnode.jsweb-scrapingcheerio

cheerio not getting response back after web scraping


I wanted to scrape a website's data, so I tried using the cheerio npm package.
The selector works perfectly fine in Chrome DevTools:

  // Collect the elements matched by the selector, take their combined text,
  // and break it into one array entry per line.
  const commodityCells = $("#tdm_base_scroll > div > div.dt_ta_09 > div.dt_ta_10");
  const commodity_array = commodityCells.text().split("\n");

  console.log(commodity_array);

screenshot

But when I use it in my code, it returns an empty response.
My Code:

const request = require("request-promise");
const cheerio = require("cheerio");
const fs = require("fs");
const { Parser: json2csv } = require("json2csv");

const url = "https://www.commodityonline.com/mandiprices/";

// Browser-like headers sent with the page request.
const requestHeaders = {
  accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  "accept-encoding": "gzip, deflate, br",
  "accept-language": "en-IN,en-GB;q=0.9,en-US;q=0.8,en;q=0.7,la;q=0.6",
};

// CSS selector for the cells whose text is printed.
const commoditySelector = "#tdm_base_scroll > div > div.dt_ta_09 > div.dt_ta_10";

/**
 * Fetches the page at `url`, loads the body into cheerio and logs the text of
 * the elements matched by `commoditySelector`, split on newlines.
 */
async function scrapeMandiPrices() {
  const mandiData = [];
  const html = await request({
    uri: url,
    headers: requestHeaders,
    gzip: true,
  });

  const $ = cheerio.load(html);
  const matchedText = $(commoditySelector).text();
  const commodity_array = matchedText.split("\n");

  console.log(commodity_array);
}

scrapeMandiPrices();

enter image description here

Website url from where I am scraping data is: https://www.commodityonline.com/mandiprices/

I learnt about this method of scraping from this video on Hitesh Chaudhary's YouTube channel.
Is there a problem with the request headers?
I am new to web scraping, so I can't figure out which step I am doing wrong.


Solution

  • In the HTTP headers, you've specified "accept-encoding": "gzip, deflate, br", which tells the server that it may send the response compressed (as gzip, deflate or Brotli). Cheerio expects plain text, so it can't parse a response body that is still compressed. Simply removing that header makes it work:

    const request = require("request-promise");
    const cheerio = require("cheerio");

    const url = "https://www.commodityonline.com/mandiprices/";

    // Headers sent with the request; no "accept-encoding" entry is included.
    const requestHeaders = {
        accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-language": "en-IN,en-GB;q=0.9,en-US;q=0.8,en;q=0.7,la;q=0.6",
    };

    // CSS selector for the cells whose text is printed.
    const commoditySelector = "#tdm_base_scroll > div > div.dt_ta_09 > div.dt_ta_10";

    /**
     * Fetches the page at `url`, loads it into cheerio and logs the text of the
     * elements matched by `commoditySelector`, split on newlines.
     */
    const scrapeCommodities = async () => {
        const html = await request({ uri: url, headers: requestHeaders });
        const $ = cheerio.load(html);

        const matchedText = $(commoditySelector).text();
        const commodity_array = matchedText.split("\n");
        console.log(commodity_array);
    };

    scrapeCommodities();
    

    Note that the request package is deprecated. One good alternative is axios:

    const axios = require("axios");
    const cheerio = require("cheerio");

    const url = "https://www.commodityonline.com/mandiprices/";

    // Download the page, turn every `div.dt_ta_09` row into a plain record and
    // print the resulting array.
    (async () => {
        const response = await axios.get(url);
        const $ = cheerio.load(response.data);

        const data = $("#tdm_base_scroll > div > div.dt_ta_09")
            .toArray()
            .map((row) => {
                // Trimmed text of the cell(s) matching `selector` inside this row.
                const cellText = (selector) => $(selector, row).text().trim();
                // Two `div.dt_ta_14` cells are read per row: the first is
                // reported as the modal price, the second as the min/max price.
                const priceCells = $("div.dt_ta_14", row);

                return {
                    commodity: cellText("div.dt_ta_10"),
                    marketCenter: cellText("div.dt_ta_11"),
                    variety: cellText("div.dt_ta_12"),
                    arrivals: cellText("div.dt_ta_13"),
                    modalPrice: priceCells.eq(0).text().trim(),
                    minMaxPrice: priceCells.eq(1).text().trim(),
                };
            });

        console.log(data);
    })().catch((err) => {
        // Report network/HTTP/parsing failures instead of leaving the promise
        // rejection unhandled, and signal the failure through the exit code.
        console.error("Failed to scrape mandi prices:", err);
        process.exitCode = 1;
    });