Search code examples
node.js, express, heroku, puppeteer

Puppeteer works well locally, but not on Heroku


I am trying to scrape data from Instagram and serve it through an API, so I use Puppeteer. Everything works fine locally, but on Heroku I get a timeout.

This is the whole code. Does anyone have a solution?

const puppeteer = require("puppeteer");
const fs = require("fs");
const ig = require("instagram-url-dl");
const cheerio = require("cheerio");
const NodeCache = require("node-cache");
const myCache = new NodeCache({ stdTTL: 5 * 60 });
const express = require("express");
const app = express();

// Contents of ./doc.html, read synchronously once when the module loads and
// decoded as UTF-8 text.
const doc = fs.readFileSync("./doc.html", { encoding: "utf-8" });

// Shared Puppeteer handles, assigned by lanuchBrowser(). Both remain undefined
// until the asynchronous launch below has finished.
let browser;
let page;

/**
 * Starts a headless browser with the sandbox flags disabled and opens one tab,
 * storing the results in the module-level `browser` and `page` variables.
 *
 * @returns {Promise<void>} Resolves once the browser and its tab are ready;
 *   rejects if the launch or the tab creation fails.
 */
const lanuchBrowser = async () => {
  browser = await puppeteer.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  page = await browser.newPage();
};

// The launch is started without being awaited, so its rejection must be handled
// here. Log the cause and stop the process explicitly instead of leaving an
// unhandled promise rejection behind.
lanuchBrowser().catch((err) => {
  console.error("Failed to launch the browser:", err);
  process.exit(1);
});

// Module-level variable holding a URL; declared without an initial value.
let url;

// Register Express's JSON body parser as application-level middleware.
app.use(express.json());

/**
 * GET / - replies with status 200 and the contents of `doc`.
 */
app.get("/", function serveDoc(req, res) {
  res.status(200);
  res.send(doc);
});

// Accepts only https://www.instagram.com/reel/... URLs. The dots are escaped so
// they match literally; an unescaped "." would also accept look-alike hosts
// such as "wwwXinstagram.com".
const REEL_URL_PATTERN = /^https:\/\/www\.instagram\.com\/reel\//;

// Upper bound (in ms) for each Puppeteer wait. A timeout of 0 disables the
// limit, which lets a page that never renders `main` hang the request forever.
const SCRAPE_TIMEOUT_MS = 60000;

/**
 * Error raised for a bad request body. Its message is returned to the client
 * verbatim, unlike the messages of unexpected errors.
 */
class ReelRequestError extends Error {}

/**
 * Loads the reel page in a dedicated tab and extracts its author, title and
 * timestamp from the rendered HTML.
 *
 * A fresh tab is opened per call (and always closed afterwards) so that
 * concurrent requests do not navigate the same page underneath each other.
 *
 * @param {string} reelUrl - Already validated reel URL.
 * @returns {Promise<{author: string, title: string, timestamp: (string|undefined)}>}
 * @throws {Error} If the browser is not available, a wait times out, or the
 *   expected caption element is missing from the page.
 */
const scrapeReelMetadata = async (reelUrl) => {
  if (!browser) {
    throw new Error("Browser is not ready");
  }
  const reelPage = await browser.newPage();
  try {
    await reelPage.goto(reelUrl, {
      timeout: SCRAPE_TIMEOUT_MS,
      waitUntil: "domcontentloaded",
    });
    console.log("1");
    await reelPage.waitForSelector("main", {
      timeout: SCRAPE_TIMEOUT_MS,
    });
    console.log("2");
    const content = await reelPage.content();
    console.log("3");

    const $page = cheerio.load(content);
    const author = $page(
      "a.x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz._acan._acao._acat._acaw._aj1-._a6hd"
    ).text();

    // .html() yields null when the element is absent; fail with a clear
    // message rather than handing null to cheerio.load().
    const captionHtml = $page("li._a9zj._a9zl._a9z5").html();
    if (captionHtml === null) {
      throw new Error("Reel caption element not found on the page");
    }
    const $caption = cheerio.load(captionHtml);
    console.log("4");
    const timestamp = $caption("time").attr("datetime");
    const title = $caption("h1._aacl._aaco._aacu._aacx._aad7._aade").text();

    return { author, title, timestamp };
  } finally {
    // Closing must not mask an error thrown by the scraping above.
    await reelPage.close().catch((closeErr) => console.log(closeErr));
  }
};

/**
 * POST /download-reel - returns the download link and metadata of a reel.
 *
 * Expects a JSON body of the form `{ "reel_url": "https://www.instagram.com/reel/..." }`.
 * The status line is always 200 and is written before any work starts; the
 * outcome is reported in the JSON body: `{ sucess: true, download_link, title,
 * author, timestamp }` on success, `{ sucess: false, error_message }` on failure.
 * Successful results are stored in `myCache` under the reel URL.
 */
app.post("/download-reel", async (req, res) => {
  res.setHeader("Content-Type", "application/json");
  res.writeHead(200);
  try {
    // A request without a parsed body is treated like a missing parameter.
    const { reel_url } = req.body || {};
    // Mirrors the requested URL into the module-level `url` variable; nothing
    // in this handler reads it back.
    url = reel_url;
    if (!reel_url) {
      throw new ReelRequestError("Missing parameter: reel_url");
    }
    if (typeof reel_url !== "string" || !REEL_URL_PATTERN.test(reel_url)) {
      throw new ReelRequestError("Invalid parameter: reel_url");
    }

    // Caching
    const cached = myCache.get(reel_url);
    if (cached) {
      res.write(JSON.stringify(cached));
      return;
    }
    res.write("");

    // Get download link
    const resp = await ig(reel_url);

    // Get author, title, timestamp
    const { author, title, timestamp } = await scrapeReelMetadata(reel_url);

    const resObj = {
      sucess: true,
      download_link: resp.data[0].url,
      title,
      author,
      timestamp,
    };
    console.log("5");

    res.write(JSON.stringify(resObj));
    console.log("6");
    myCache.set(reel_url, resObj);
  } catch (err) {
    console.log(err);
    // Only validation errors are echoed to the client. Anything else (scrape
    // failure, timeout, downloader error) is reported generically instead of
    // being mislabelled as an invalid parameter.
    const error_message =
      err instanceof ReelRequestError
        ? err.message
        : "Failed to fetch reel data";
    res.write(
      JSON.stringify({
        sucess: false,
        error_message,
      })
    );
  } finally {
    res.end();
  }
});

// Port taken from the PORT environment variable, falling back to 3000 when it
// is unset or empty.
const listenPort = process.env.PORT || 3000;

/** Logs a startup message; passed to app.listen as its callback. */
function announceStartup() {
  console.log("Lanuch the app ....");
}

app.listen(listenPort, announceStartup);

I tried adding {timeout: 0}, but then I got no response from the server. To figure out where the issue is, I added a console.log with a number after each operation and ran heroku logs --tail.

I found that only the number 1 is printed in the console.

So the timeout happens at this line of code:

await page.waitForSelector("main", { timeout: 0, });


Solution

  • I just used this buildpack, and it worked:

    https://github.com/jontewks/heroku-buildpack-puppeteer-firefox