I am try to scrap data from instagram and serve it in an api, So i use puppeteer and all things good locally but in heroku i get timeout
and that's the whole code, any one have a solution ?
const puppeteer = require("puppeteer");
const fs = require("fs");
const ig = require("instagram-url-dl");
const cheerio = require("cheerio");
const NodeCache = require("node-cache");
const myCache = new NodeCache({ stdTTL: 5 * 60 });
const express = require("express");
const app = express();
const doc = fs.readFileSync("./doc.html", "utf-8");
let browser;
let page;
const lanuchBrowser = async () => {
browser = await puppeteer.launch({
headless: true,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
page = await browser.newPage();
};
lanuchBrowser();
let url;
app.use(express.json());
app.get("/", (req, res) => {
res.status(200).send(doc);
});
app.post("/download-reel", async (req, res) => {
res.setHeader("Content-Type", "application/json");
res.writeHead(200);
try {
const { reel_url } = req.body;
url = reel_url;
const regex = /^https:\/\/www.instagram.com\/reel\//;
const valid = regex.test(reel_url);
if (!reel_url) {
throw new Error("Missing parameter: reel_url");
} else if (!valid) {
throw new Error("Invalid parameter: reel_url");
}
// Caching
const theRes = await myCache.get(reel_url);
if (theRes) {
return res.write(JSON.stringify(theRes));
}
res.write("");
// Get download link
const resp = await ig(reel_url);
// Get author, title, timestamp
await page.goto(reel_url, {
timeout: 60000,
waitUntil: "domcontentloaded",
});
console.log("1");
await page.waitForSelector("main", {
timeout: 0,
});
console.log("2");
const content = await page.content();
console.log("3");
let $ = cheerio.load(content);
const author = $(
"a.x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz._acan._acao._acat._acaw._aj1-._a6hd"
).text();
$ = cheerio.load($("li._a9zj._a9zl._a9z5").html());
console.log("4");
const timestamp = $("time").attr("datetime");
const title = $("h1._aacl._aaco._aacu._aacx._aad7._aade").text();
let resObj = {
sucess: true,
download_link: resp.data[0].url,
title,
author,
timestamp,
};
console.log("5");
res.write(JSON.stringify(resObj));
console.log("6");
myCache.set(reel_url, resObj);
} catch (err) {
console.log(err);
err.message == "Missing parameter: reel_url"
? err.message
: (err.message = "Invalid parameter: reel_url");
res.write(
JSON.stringify({
sucess: false,
error_message: err.message,
})
);
} finally {
res.end();
}
});
app.listen(process.env.PORT || 3000, () => {
console.log("Lanuch the app ....");
});
I tried add {timeout: 0} i got no response from the server, So i tried to figure out where is the issue so i console.log number after each operation and when i run heroku logs --tail
I found out that only number 1 is print in the console
So the timeout happend wih this line of code
await page.waitForSelector("main", { timeout: 0, });
I just uses this buildpack and it worked
https://github.com/jontewks/heroku-buildpack-puppeteer-firefox