I am trying to write a program that scans multiple URLs at the same time (parallelization). I have extracted the sitemap and stored it as an array in a variable, as shown below, but I am unable to open the URLs using Puppeteer. I am getting the error below:
originalMessage: 'Cannot navigate to invalid URL'
My code is below. Can someone please help me out?
// Third-party sitemap libraries.
const sitemapper = require('@mastixmc/sitemapper'); // NOTE(review): imported but never used in this script
const SitemapXMLParser = require('sitemap-xml-parser');
// Sitemap index to scan.
const url = 'https://edition.cnn.com/sitemaps/sitemap-section.xml';
/*If sitemapindex (link of xml or gz file) is written in sitemap, the URL will be accessed.
You can optionally specify the number of concurrent accesses and the number of milliseconds after processing and access to resume processing after a delay.
*/
// Parser options: delay between requests (ms) and maximum number of entries.
const options = {
delay: 3000,
limit: 50000
};
const sitemapXMLParser = new SitemapXMLParser(url, options);
sitemapXMLParser.fetch().then(result => {
// Collect the <loc> value of every sitemap entry.
var locs = result.map(value => value.loc)
// BUG: toString() joins the whole array into one comma-separated string;
// the replace() call then does nothing useful ("[" never appears in it).
var locsFiltered = locs.toString().replace("[",'<br>');
const urls = locsFiltered
console.log(locsFiltered)
const puppeteer = require("puppeteer");
async function scrapeProduct(url) {
// `urls` is a single string here, not an array of URLs.
const urls = locsFiltered
const browser = await puppeteer.launch({
headless: false
});
// BUG: `i` is an implicit global (no let/const). Indexing a string yields
// single characters, so page.goto() receives e.g. "h" — hence the
// "Cannot navigate to invalid URL" error.
for (i = 0; i < urls.length; i++) {
const page = await browser.newPage();
const url = urls[i];
// NOTE(review): `promise` is created but never awaited or used.
const promise = page.waitForNavigation({
waitUntil: "networkidle2"
});
await page.goto(`${url}`);
}};
// NOTE(review): the returned promise is not awaited; errors are unhandled.
scrapeProduct();
});
You see the "invalid URL" error because you converted the array into a URL string with the wrong method.
These lines are better:
// var locsFiltered = locs.toString().replace("[",'<br>') // This is wrong
// const urls = locsFiltered // So value is invalid
// console.log(locsFiltered)
// NOTE(review): assumes each `loc` is an array whose first element is the URL
// string (the sitemap-xml-parser node format) — confirm against the library.
const urls = locs.map(value => value[0]) // This is better
So, to scrape the CNN sites, I've added puppeteer-cluster for speed:
// puppeteer-cluster drives a pool of Puppeteer pages for parallel scraping.
const { Cluster } = require('puppeteer-cluster')
const sitemapper = require('@mastixmc/sitemapper') // NOTE(review): unused in this script
const SitemapXMLParser = require('sitemap-xml-parser')
// Sitemap index listing the pages to crawl.
const url = 'https://edition.cnn.com/sitemaps/sitemap-section.xml'
/**
 * Crawl every URL extracted from the sitemap using a pool of Puppeteer pages.
 *
 * @param {Array} locs - Entries from sitemap-xml-parser; each entry is assumed
 *   to be a one-element array holding the URL string — TODO confirm.
 * @returns {Promise<void>} Resolves once every queued URL has been visited.
 */
async function scrapeProduct(locs) {
  // Unwrap the URL string from each parser entry.
  const urls = locs.map((value) => value[0]);
  const cluster = await Cluster.launch({
    // One browser context per worker keeps cookies/storage isolated.
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 2, // You can set this to any number you like
    puppeteerOptions: {
      headless: false,
      devtools: false,
      args: [],
    },
  });
  // Worker body: each queued URL arrives here as `data`.
  await cluster.task(async ({ page, data: url }) => {
    await page.goto(url, { timeout: 0, waitUntil: 'networkidle2' });
    const screen = await page.screenshot();
    // Store screenshot, do something else
  });
  // BUG FIX: the original `for (i = 0; ...)` leaked `i` as an implicit global;
  // iterate with for...of instead.
  for (const pageUrl of urls) {
    console.log(pageUrl);
    await cluster.queue(pageUrl);
  }
  await cluster.idle(); // wait for the queue to drain
  await cluster.close();
}
/*
 * If a sitemapindex (a link to an .xml or .gz file) is listed in the sitemap,
 * that URL is fetched as well. Optionally, the number of concurrent accesses
 * and a delay (in milliseconds) before processing resumes can be configured.
 */
const options = {
  delay: 3000,
  limit: 50000,
};

const sitemapXMLParser = new SitemapXMLParser(url, options);

// Fetch all sitemap entries, pull out their <loc> values, then crawl them.
sitemapXMLParser.fetch().then(async (entries) => {
  const locs = entries.map((entry) => entry.loc);
  await scrapeProduct(locs);
});