I am using "puppeteer": "^19.11.1".
I created this function to press the consent button on this page:
async function handleConsent(page, logger) {
  const consentButtonSelector =
    '#uc-center-container > div.sc-eBMEME.ixkACg > div > div > div > button.sc-dcJsrY.bSKKNx';
  try {
    // Wait for the iframe to load
    await page.waitForSelector("iframe", { timeout: 3000 });
    // Identify the iframe that contains the consent button
    const iframeElement = await page.$('iframe[name="__tcfapiLocator"]');
    if (iframeElement) {
      const iframeContent = await iframeElement.contentFrame();
      // Attempt to click the consent button within the iframe
      const consentButton = await iframeContent.$(consentButtonSelector);
      if (consentButton) {
        await iframeContent.click(consentButtonSelector);
        logger.info("Consent button clicked inside the iframe.");
      } else {
        logger.info("Consent button not found inside the iframe.");
      }
    } else {
      logger.info("Iframe with the consent message not found.");
    }
    // Wait for any potential redirects or updates after clicking
    await page.waitForTimeout(3000);
  } catch (error) {
    logger.error(`An error occurred while handling consent: ${error}`);
  }
}
My problem is that the selector is not found, even though I am trying to select the iframe.
Any suggestions on what I am doing wrong?
I appreciate your replies!
This answer is correct in pointing out that the element you want is in a shadow root, but the solution it provides should be avoided. You can use >>> to pierce shadow roots easily in Puppeteer:
const puppeteer = require("puppeteer"); // ^22.6.0

const url = "<Your URL>";

let browser;
(async () => {
  browser = await puppeteer.launch({headless: false});
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const acceptBtnSelector = ">>> [data-testid='uc-accept-all-button']";
  const accept = await page.waitForSelector(acceptBtnSelector);
  await accept.click();
  await page.waitForSelector(acceptBtnSelector, {hidden: true});
  await page.screenshot({path: "proof.png"});
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
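The >>> deep descendant combinator matches elements inside open shadow roots anywhere in the document, so you don't need to locate the shadow host and call shadowRoot yourself.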
And even if you couldn't use >>> and waitForSelector here, page.waitForFunction is preferred over rewriting polling from scratch, which is hard to maintain and unreliable.
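For example, here's a minimal sketch of the waitForFunction approach. The #usercentrics-root host id is an assumption based on typical Usercentrics markup, so verify it in devtools for your page:

// Sketch only: #usercentrics-root is an assumed shadow host id; adjust
// it to match the actual consent widget on the page.
const handle = await page.waitForFunction(() => {
  const host = document.querySelector("#usercentrics-root");
  return host?.shadowRoot?.querySelector(
    "[data-testid='uc-accept-all-button']"
  );
});
await handle.asElement().click();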
However, I'm betting your actual goal on the site is not simply to click an accept button for its own sake; more likely, you want to scrape data. Most of the critical data on the page is already present in the static HTML rather than rendered asynchronously, so you should be able to scrape it without JS or clicking any buttons:
const puppeteer = require("puppeteer"); // ^22.6.0

const url = "<Your URL>";

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.setJavaScriptEnabled(false);
  await page.setRequestInterception(true);
  page.on("request", req =>
    req.url() === url ? req.continue() : req.abort()
  );
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const data = await page.$$eval('[class^="EstateItem"]', els =>
    els.map(el => {
      const text = s => el.querySelector(s).textContent.trim();
      return {
        title: text("h2"),
        price: text('[data-test="price"]'),
        area: text('[data-test="area"]'),
        rooms: text('[data-test="rooms"]'),
        location: text('[class^="estateFacts"] span'),
        locationDetail: text(
          '[class^="estateFacts"] div:nth-of-type(2) span'
        ),
        provider: text('[class^="ProviderName"]'),
        picture: el.querySelector("img").getAttribute("data-src"),
      };
    })
  );
  console.log(data);
  console.log(data.length);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
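Disabling JS and aborting every request other than the initial document keeps the navigation fast, since the static HTML already contains everything we need.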
Output snippet:
[
  {
    title: 'Pärchentraum - Charmante 2 Zimmer Neubauwohnung in beliebter Wohngegend - Nahe Perchtoldsdorfer Heide!',
    price: '430.000 €',
    area: '59.79 m²',
    rooms: '2 Zi.',
    location: 'Wien,Liesing (Liesing)',
    locationDetail: 'Erstbezug, Neubau, Bad mit Wanne, ...',
    provider: 'Bero Immobilien GmbH',
    picture: 'https://ms.immowelt.org/7a82dc6d-4484-4b90-9377-9cd9d2f85c49/404cf877-f3bc-4912-ad33-1f55b1b771bd/328x224.jpg'
  },
  {
    title: 'Singlehit! Charmante 2 Zimmer-Neubauwohnung in beliebter Wohngegend Liesing`s',
    price: '299.000 €',
    area: '41.23 m²',
    rooms: '2 Zi.',
    location: 'Wien,Liesing (Liesing)',
    locationDetail: 'Erstbezug, Neubau, Loggia',
    provider: 'Bero Immobilien GmbH',
    picture: 'https://ms.immowelt.org/db352cce-9737-4e0d-85cf-9198af4f16aa/1e789a42-64a8-4500-8c50-6656f0dae60c/328x224.jpg'
  },
  // ...
]
20
This illustrates a common antipattern in web scraping: assuming you need to behave like a user would, with JS enabled, dutifully clicking buttons. Much of the time there's a more direct approach that's faster to run and write, and more reliable; basically better by any metric.
At this point, you can even skip Puppeteer entirely and use native fetch and a lightweight HTML parser like Cheerio:
const cheerio = require("cheerio"); // ^1.0.0-rc.12

const url = "<Your URL>";

fetch(url)
  .then(res => {
    if (!res.ok) {
      throw Error(res.statusText);
    }
    return res.text();
  })
  .then(html => {
    const $ = cheerio.load(html);
    const data = [...$('[class^="EstateItem"]')].map(e => {
      const text = s => $(e).find(s).text().trim();
      return {
        title: text("h2"),
        price: text('[data-test="price"]'),
        area: text('[data-test="area"]'),
        rooms: text('[data-test="rooms"]'),
        location: text('[class^="estateFacts"] span'),
        locationDetail: text(
          '[class^="estateFacts"] div:nth-of-type(2) span'
        ),
        provider: text('[class^="ProviderName"]'),
        picture: $(e).find("img").attr("data-src"),
      };
    });
    console.log(data);
  })
  .catch(err => console.error(err));
Output is the same, but Cheerio is faster:
# optimized puppeteer:
real 0m2.015s
user 0m0.691s
sys 0m0.139s
# fetch/cheerio:
real 0m0.804s
user 0m0.282s
sys 0m0.044s
If you want to scrape multiple pages, simply loop over the URL's pagination parameter rather than interacting with the UI:
const cheerio = require("cheerio");

const url = "<Your Base URL>&sp="; // note: the page number is removed from `sp=`

const get = url =>
  fetch(url).then(res => {
    if (!res.ok) {
      throw Error(res.statusText);
    }
    return res.text();
  });

(async () => {
  const data = [];
  for (let page = 1; page < 10 /* for testing */; page++) {
    const $ = cheerio.load(await get(url + page));
    const chunk = [...$('[class^="EstateItem"]')].map(e => {
      const text = s => $(e).find(s).text().trim();
      return {
        title: text("h2"),
        price: text('[data-test="price"]'),
        area: text('[data-test="area"]'),
        rooms: text('[data-test="rooms"]'),
        location: text('[class^="estateFacts"] span'),
        locationDetail: text(
          '[class^="estateFacts"] div:nth-of-type(2) span'
        ),
        provider: text('[class^="ProviderName"]'),
        picture: $(e).find("img").attr("data-src"),
      };
    });
    if (!chunk.length) {
      break;
    }
    data.push(...chunk);
  }
  console.log(JSON.stringify(data, null, 2));
  console.log(data.length);
})()
  .catch(err => console.error(err));
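The empty-chunk check breaks out of the loop as soon as a page returns no results, so the hardcoded upper bound is just a safety cap for testing.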
See Puppeteer not giving accurate HTML code for page with shadow roots for a detailed overview of shadow roots in Puppeteer.
Disclosure: I'm the author of the linked blog post.