I want to become acquainted with JavaScript and Puppeteer, so consider this a practice example. I managed to put together a Puppeteer script (for learning purposes) that fetches the innerText of all four given selectors within my HTML code block. For the most part the script runs and works. The selectors are:
class="fc-item__kicker"
class="js-headline-text"
link a href
class="fc-item__standfirst"
The problem is that there are several instances of the same selectors.
This means that I can extract the innerText of only the first instance, but I can't fetch the innerText of the second instance. How can I accomplish this?
To train myself I'll use the front page of The Guardian because it has deep and complicated nested HTML tags and classes.
This is a small part of the HTML code block:
<!DOCTYPE html>
<html>
<body>
<!-- Trimmed excerpt: a single section with id="headlines" that contains two article items. -->
<div class="l-side-margins">
<div class="facia-page">
<section id="headlines" class="fc-container fc-container--has-toggle">
<div class="fc-container__inner">
<div class="fc-container--rolled-up-hide fc-container__body" id="container-10f21d96-18f6-426f-821b-19df55dfb831">
<div class="fc-slice-wrapper">
<ul class="u-unstyled l-row l-row--cols-4 fc-slice fc-slice--qqq-q">
<!-- Article 1: here .fc-item__container is a direct child of the <li>. -->
<li class="fc-slice__item l-row__item l-row__item--span-3 u-faux-block-link">
<div class="fc-item__container">
<div class="fc-item__content">
<div class="fc-item__header">
<h3 class="fc-item__title"><a class="fc-item__link" href="https://www.example.com"><span class="fc-item__kicker">Monterey Park shooting</span> <span class="u-faux-block-link__cta fc-item__headline"><span class="js-headline-text">Beloved dance hall manager named among victims</span></span></a></h3>
</div>
<div class="fc-item__standfirst-wrapper">
<div class="fc-item__standfirst">
California officials yet to identify eight others who died in Saturday attack, at least 36th mass shooting in US so far this year
</div>
</div>
<div class="fc-item__footer--vertical">
<!-- Sublink: sits in the footer and uses fc-sublink__* classes, so the .fc-item__header selectors do not match it. -->
<ul class="fc-sublinks u-unstyled u-faux-block-link__promote">
<li class="fc-sublink fc-sublink--pillar-news fc-sublink--type-article">
<h4 class="fc-sublink__title"><a class="fc-sublink__link" href="https://www.example.com"><span class="fc-sublink__kicker">LA mass shooting</span> Man who disarmed California shooter tells of violent struggle for gun</a></h4>
</li>
</ul>
</div>
</div>
</div>
</li>
</ul>
</div>
<div class="fc-slice-wrapper">
<ul class="u-unstyled l-row l-row--cols-4 fc-slice fc-slice--q-q-ql-ql">
<!-- Article 2: here .fc-item__container is wrapped in an extra .fc-item div and has a media wrapper. -->
<li class="fc-slice__item l-row__item l-row__item--span-1 u-faux-block-link">
<div class="fc-item fc-item--has-image fc-item--pillar-news fc-item--type-article js-fc-item fc-item--list-media-mobile fc-item--standard-tablet js-snappable">
<div class="fc-item__container">
<div class="fc-item__media-wrapper">
<div class="fc-item__image-container u-responsive-ratio"></div>
</div>
<div class="fc-item__content">
<div class="fc-item__header">
<h3 class="fc-item__title"><a class="fc-item__link" href="https://www.example.com"><span class="fc-item__kicker">Germany</span> <span class="u-faux-block-link__cta fc-item__headline"><span class="js-headline-text">Five charged over second alleged far-right plot against government</span></span></a></h3>
</div>
<div class="fc-item__standfirst-wrapper">
<div class="fc-item__standfirst">
Four men and a woman accused of planning to abduct health minister and overthrow government
</div>
<div class="fc-item__meta js-item__meta"></div>
</div>
</div>
</div>
</div>
</li>
</ul>
</div>
</div>
</div>
</section>
</div>
</div>
</body>
</html>
This is my script
const fs = require('fs');
const puppeteer = require('puppeteer');
/**
 * Launches a Puppeteer browser, opens the Guardian international front page,
 * extracts kicker / headline / link / standfirst inside the page, logs the
 * result and writes it to headlines.json.
 *
 * @returns {Promise<void>} Resolves after browser.close() has completed.
 */
async function run() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.theguardian.com/international/');
// NOTE(review): '#headlines' is an id selector and the sample markup has one
// element with that id, so this Array.from() produces at most one object.
// Each e.querySelector(...) below returns only the first match inside that
// element, which is why only the first article's data comes back.
const headlines = await page.evaluate(() => Array.from(document.querySelectorAll('#headlines'), (e) => ({
kicker: e.querySelector('.fc-item__header .fc-item__kicker').innerText,
headline: e.querySelector('.fc-item__header .js-headline-text').innerText,
link: e.querySelector('.fc-item__header a').href,
// NOTE(review): both replaceAll arguments render as a plain space here, which
// would make the call a no-op; presumably the first was meant to be a
// non-breaking space - confirm.
standfirst: e.querySelector('.fc-item__standfirst-wrapper .fc-item__standfirst').textContent.replaceAll(" ", " ").trim(),
})));
// The result is logged twice as written.
console.log(headlines);
console.log(headlines);
// Save data to JSON file
// Callback-style write: it is not awaited, so browser.close() below runs
// without waiting for the file to be written.
fs.writeFile('headlines.json', JSON.stringify(headlines), (err) => {
// Thrown from inside the callback, so the caller of run() cannot catch it.
if (err) throw err;
console.log('File saved');
});
await browser.close();
}
// The returned promise is not awaited and has no rejection handler.
run();
This is the desired result:
[
{
kicker: 'Monterey Park shooting',
headline: 'Beloved dance hall manager named among victims',
link: 'https://www.example.com',
standfirst: 'California officials yet to identify eight others who died in Saturday attack, at least 36th mass shooting in US so far this year'
},
{
kicker: 'Germany',
headline: 'Five charged over second alleged far-right plot against government',
link: 'https://www.example.com',
standfirst: 'Four men and a woman accused of planning to abduct health minister and overthrow government'
}
]
I don't see the #headlines
element you show in the markup served when I visit the site, but the basic problem is that you're looping over a single wrapper around all of the articles rather than over each of the articles. IDs are unique in almost all (valid) websites, so there's generally no point in trying to loop over an array that's nearly guaranteed to contain at most one item.
Try adding .fc-item__container
to your #headlines
selector: #headlines .fc-item__container
, or just use .fc-item__container
on its own, as shown below.
// Scrapes every article card (.fc-item__container) from a statically rendered
// page with Puppeteer and writes the result to headlines.json.
const fs = require("node:fs/promises");
const puppeteer = require("puppeteer"); // ^19.4.1

const url = "<Your URL>";

// Declared outside the async function so the .finally() handler can close the
// browser even when an earlier step throws.
let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();

  // The data is already in the served HTML, so page scripts are disabled and
  // every request other than the one for `url` itself is aborted.
  // NOTE(review): this is an exact string comparison, so `url` must be written
  // exactly as the browser requests it - confirm for URLs that redirect.
  await page.setJavaScriptEnabled(false);
  await page.setRequestInterception(true);
  page.on("request", req => {
    if (req.url() !== url) {
      req.abort();
    }
    else {
      req.continue();
    }
  });
  await page.goto(url, {waitUntil: "domcontentloaded"});

  // One result object per article container; the callback runs inside the page.
  const data = await page.$$eval(".fc-item__container", els =>
    els.map(e => {
      // Trimmed text of the first match inside this container, or undefined
      // when the container has no such element.
      const text = s => e.querySelector(s)?.textContent.trim();
      return {
        kicker: text(".fc-item__header .fc-item__kicker"),
        headline: text(".fc-item__header .js-headline-text"),
        // Guarded like `text` above, so a container without a header link
        // yields undefined instead of throwing and aborting the whole scrape.
        link: e.querySelector(".fc-item__header a")?.getAttribute("href"),
        standfirst: text(".fc-item__standfirst-wrapper .fc-item__standfirst"),
      };
    })
  );
  await fs.writeFile("headlines.json", JSON.stringify(data, null, 2));
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
Since the data is in the static HTML, we can disable JS, block every request except the one for the page itself, and wait only until the DOM content has loaded.
Better yet, we could skip Puppeteer entirely and use a lightweight HTML parser and an HTTP request:
// Fetches the page over plain HTTP, parses it with Cheerio, and writes one
// object per article card (.fc-item__container) to headlines.json.
const cheerio = require("cheerio"); // 1.0.0-rc.12
const fs = require("node:fs/promises");

const url = "<Your URL>";

fetch(url) // Node 18 or install node-fetch, or use another library like axios
  .then(res => {
    if (!res.ok) {
      // statusText can be empty, so the numeric status is included to keep
      // the error message informative.
      throw new Error(`Request failed: ${res.status} ${res.statusText}`);
    }
    return res.text();
  })
  .then(html => {
    const $ = cheerio.load(html);
    const data = [...$(".fc-item__container")].map(e => {
      // Trimmed text of the first match inside this container ("" if none).
      const text = s => $(e).find(s).first().text().trim();
      return {
        kicker: text(".fc-item__header .fc-item__kicker"),
        headline: text(".fc-item__header .js-headline-text"),
        // .find() always returns a Cheerio selection, so no optional chaining
        // is needed; .attr() yields undefined when nothing matched.
        link: $(e).find(".fc-item__header a").attr("href"),
        standfirst: text(".fc-item__standfirst-wrapper .fc-item__standfirst"),
      };
    });
    return fs.writeFile("headlines.json", JSON.stringify(data, null, 2));
  })
  // Report network, HTTP, parse and write failures instead of leaving the
  // promise chain without a rejection handler.
  .catch(err => console.error(err));
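I'm using the promise-based fs
API (node:fs/promises) to avoid a race condition and callback ugliness: in your script, browser.close() runs without waiting for the callback-style fs.writeFile to finish.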