Apify can crawl links from a sitemap.xml file, as shown in the following example:
// Crawl every URL listed in a remote sitemap using a RequestList.
const Apify = require('apify');

Apify.main(async () => {
    // A RequestList can load its sources directly from a sitemap.xml URL
    // via the `requestsFromUrl` source option.
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: 'https://edition.cnn.com/sitemaps/cnn/news.xml' }],
    });
    await requestList.initialize();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        // Called once for every page fetched from the list.
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);
            // Save the URL, the page title and the full HTML to the default dataset.
            const title = await page.title();
            const html = await page.content();
            await Apify.pushData({ url: request.url, title, html });
        },
    });

    await crawler.run();
    console.log('Done.');
});
https://sdk.apify.com/docs/examples/puppeteersitemap#docsNav
However, I am not sure how to crawl links from sitemap.xml when using a RequestQueue. For example:
const requestQueue = await Apify.openRequestQueue();
// Fixed: the original had an unterminated string literal here
// (`{url: "https://google.com}`), which is a syntax error.
await requestQueue.addRequest({ url: 'https://google.com' });
// NOTE(review): this request fetches the raw sitemap XML. enqueueLinks with an
// 'a' selector finds nothing on an XML document — that is why only sitemap.xml
// was crawled. Parse the XML yourself, or feed it to a RequestList via
// `requestsFromUrl` instead.
await requestQueue.addRequest({ url: 'https://google.com/sitemap.xml' });

const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    // This function is called for every page the crawler visits.
    handlePageFunction: async (context) => {
        const { request, page } = context;
        const title = await page.title();
        const pageUrl = request.url;
        console.log(`Title of ${pageUrl}: ${title}`);
        // Fixed: the original referenced an undefined `pseudoUrls` variable
        // (a ReferenceError at runtime). An empty array means no filtering:
        // every link found on the page is enqueued.
        await Apify.utils.enqueueLinks({
            page, selector: 'a', pseudoUrls: [], requestQueue,
        });
    },
});
await crawler.run();
The great thing about Apify is that you can use both a RequestList and a RequestQueue together. In that case, items are taken from the list into the queue as you scrape (so the queue is not overloaded). By using both, you get the best of both worlds.
Apify.main(async () => {
    // Seed a RequestList from the sitemap and pair it with a RequestQueue:
    // items flow from the list into the queue as the crawl progresses.
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: 'https://edition.cnn.com/sitemaps/cnn/news.xml' }],
    });
    await requestList.initialize();
    const requestQueue = await Apify.openRequestQueue();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        requestQueue,
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);
            // Example only — replace with your own enqueueing logic.
            // `pseudoUrls: null` enqueues every link the selector matches.
            await Apify.utils.enqueueLinks({
                page, selector: 'a', pseudoUrls: null, requestQueue,
            });
            // Store the URL, title and full HTML in the default dataset.
            const title = await page.title();
            const html = await page.content();
            await Apify.pushData({ url: request.url, title, html });
        },
    });

    await crawler.run();
    console.log('Done.');
});
If you want to use just the queue, you will need to parse the XML yourself. Of course, this is not a big issue — you can parse it easily with Cheerio, either before the crawler runs or by using Apify.CheerioCrawler.
Anyway, we recommend using a RequestList for bulk URLs, because it is created almost instantly in memory, whereas the queue is actually a database (or JSON files when running locally).