I'm working with nodejs and mongodb (using mongoose pkg).
I've an async function which updates a document. I'm running that function 10 times with different parameter in parallel. My question is that will mongoose/mongodb auto fetch the document again before saving if it's already saved before. Or will it throw an error?
Here's my code
const crawlSitemap = async (projectId) => {
try {
const project = await Project.findById(projectId);
const browser = await puppeteer.launch({ headless: "new" });
const crawl = (sitemap) => async () => {
const page = await browser.newPage();
page.setDefaultNavigationTimeout(120 * 1000);
console.log(sitemap.url);
if (!validator.isURL(sitemap.url) || sitemap.crawlStatus !== "queued")
return;
await page.goto(sitemap.url, { waitUntil: "networkidle0" });
await page.evaluate(() => {
const scripts = document.querySelectorAll("script");
scripts.forEach((script) => script.remove());
const images = document.querySelectorAll("img");
images.forEach((img) => img.remove());
const styles = document.querySelectorAll(
'link[rel="stylesheet"], style'
);
styles.forEach((style) => style.remove());
});
const textContent = await page.evaluate(() => document.body.textContent);
await page.close();
await promisify(fs.writeFile)(
path.join(__dirname, `tmp/${sitemap.url.replaceAll("/", "-")}.txt`),
textContent,
"utf-8"
);
sitemap.crawlStatus = "ok";
await project.save();
};
await parallelLimit(project.sitemap.map(crawl), 10);
await browser.close();
} catch (error) {
console.log(error);
}
};
Mongoose isn't designed to call save
in parallel multiple times on the same instance.
You can try instead findOneAndUpdate
every time you'd like to change sitemap status:
const crawl = (sitemap) => async () => {
const page = await browser.newPage();
// ...
// Instead of this:
//
// sitemap.crawlStatus = "ok";
// await project.save();
// Do this:
await Project.findOneAndUpdate(
{
_id: project.
'sitemap.url': sitemap.url
},
{
'sitemap.$.crawlStatus': 'ok'
}
);
};