Search code examples
node.jsweb-scrapingcheerio

Is there a way to capture/scrape an entire table instead of cell by cell with the Cheerio library?


My scraper functions runs in O(n^2), takes 6 seconds to execute and I'm looking at ways to optimize.

The source site I'm scraping is www.rate.am/en. Screen shot below enter image description here

Scraper function

const rp = require("request-promise");
const $ = require("cheerio");
const url = "http://rate.am/en/armenian-dram-exchange-rates/banks/non-cash";

const TABLE_ROW_IDS = [
  "69460818-02ec-456e-8d09-8eeff6494bce",
  "0fffdcc4-8e36-49f3-9863-93ad02ce6541",
  "65351947-217c-4593-9011-941b88ee7baf",
  "8e9bd4c8-6f4a-4663-ae86-b8fbaf295030",
  "ebd241ce-4a38-45a4-9bcd-c6e607079706",
  "466fe84c-197f-4174-bc97-e1dc7960edc7",
  "5ee70183-87fe-4799-802e-ef7f5e7323db",
  "f3ffb6cf-dbb6-4d43-b49c-f6d71350d7fb",
  "b5bb13d2-8a79-43a8-a538-ffd1e2e21009",
  "db08ff22-add9-45ea-a450-1fe5b1993704",
  "2119a3f1-b233-4254-a450-304a2a5bff19",
  "989ba942-a5cf-4fc2-b62e-3248c4edfbbc",
  "e1a68c2e-bc47-4f58-afd2-3b80a8465b14",
  "332c7078-97ad-4bf7-b8ee-44d85a9c88d1",
  "133240fd-5910-421d-b417-5a9cedd5f5f7"
];

const rateScraper = () => {
  return (
    rp(url)
      .then(url => {
        let resultsArr = [];
        for (let i = 0; i < TABLE_ROW_IDS.length; i++) {
          let currencyArr = [];
          let bankName = $(
            `tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(2) > a`,
            url
          ).text();

          for (let j = 6; j <= 13; j++) {
            currencyArr.push(
              $(
                `tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(${j})`,
                url
              ).text()
            );
          }
          resultsArr.push({
            bankName,
            usd: { buy: currencyArr[0], sell: currencyArr[1] },
            eur: { buy: currencyArr[2], sell: currencyArr[3] },
            rub: { buy: currencyArr[4], sell: currencyArr[5] },
            gbp: { buy: currencyArr[6], sell: currencyArr[7] }
          });
        }
        return resultsArr;
      })
      .catch(error => {
        console.log(error);
      })
  );
};
module.exports = rateScraper;

The TABLE_ROW_IDS are a constant that are unique to each row. The first loop iterates over each row and the second loop extracts the individual rates in each cell.

I suspect that the dozens of Cheerio calls are expensive and I'm looking for a way to extract the entire table all at once to parse.

Edit: Optimized code

const rateScraper = () => {
  let start = Date.now();

  return got(url)
    .then((response) => {
      let resultsArr = [];
      const $ = cheerio.load(response.body);
      $("#rb > tbody > tr")
        .filter((i, el) => TABLE_ROW_IDS.includes(el.attribs.id))
        .each((i, elem) => {
          const cells = $(elem).find("td");
          resultsArr.push({
            bankName: cells.eq(1).text(),
            usd: { buy: cells.eq(5).text(), sell: cells.eq(6).text() },
            eur: { buy: cells.eq(7).text(), sell: cells.eq(8).text() },
            rub: { buy: cells.eq(9).text(), sell: cells.eq(10).text() },
            gbp: { buy: cells.eq(11).text(), sell: cells.eq(12).text() },
          });
        });
      let end = Date.now();
      console.log("time", end - start);
      return resultsArr;
    })
    .catch((error) => {
      console.log(error);
    });
};
module.exports = rateScraper;

Solution

  • So the main thing in my opinion is that you don't want to make it traverse the whole document to find each thing, you want to find the rows, and then traverse just the row to get each cell. Currently each time you do

    $(
        `tbody > tr[id=${TABLE_ROW_IDS[i]}] > td:nth-child(${j})`,
        url
    ).text()
    

    you have to traverse the whole thing.

    In pure JS on the page, you could do:

    Array.from(
        document.getElementById('rb').querySelectorAll('tr')
    ).filter((a) => TABLE_ROW_IDS.includes(a.id))
    .map((row) => {
        const cells = row.querySelectorAll('td');
        return { 
            bankName: cells[1].innerText, 
            usd: { buy: cells[5].innerText, sell: cells[6].innerText },
            eur: { buy: cells[7].innerText, sell: cells[8].innerText },
            rub: { buy: cells[9].innerText, sell: cells[10].innerText },
            gbp: { buy: cells[11].innerText, sell: cells[12].innerText }
        };
    });
    

    So translate to cheerio? I'm guessing it would be something like:

    .then(url => {
        const results = [];
        $('#rd > tr', url).filter(() => TABLE_ROW_IDS.includes(this.id)).each((i, elem) => {
            const cells = $(this).find('td');
            results.push({ 
                bankName: cells.eq(1).text(), 
                usd: { buy: cells.eq(5).text(), sell: cells.eq(6).text() },
                eur: { buy: cells.eq(7).text(), sell: cells.eq(8).text() },
                rub: { buy: cells.eq(9).text(), sell: cells.eq(10).text() },
                gbp: { buy: cells.eq(11).text(), sell: cells.eq(12).text() }
            });
        });