import {chromium} from 'playwright'; // Web scraper Library
import * as fs from 'fs';
/**
 * Script entry point: launches headless Chromium, loads the ABCmouse JP
 * privacy page, and writes the serialized HTML to `test.html` in the current
 * working directory.
 *
 * The browser is always closed, even when navigation or the file write fails,
 * so no Chromium process is left behind and the Node process can exit.
 * Any failure is logged and reported through a non-zero exit code.
 */
(async function () {
  const chromeBrowser = await chromium.launch({ headless: true });
  try {
    const context = await chromeBrowser.newContext({
      ignoreHTTPSErrors: true,
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    });
    const page = await context.newPage();
    await page.goto("https://jp.abcmouse.com/mkt/privacy/", { waitUntil: 'load', timeout: 60000 });
    // NOTE(review): page.content() serializes the document's outer HTML only;
    // markup rendered inside shadow roots is presumably not included, so the
    // saved file can look incomplete for sites built on shadow DOM.
    const content = await page.content();
    fs.writeFileSync('test.html', content);
    console.log("done");
  } finally {
    // Closing the browser also closes its contexts and pages.
    await chromeBrowser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
How do we access the body content of this URL? I am able to extract many web pages, but some of them don't work. Is there anything specific that needs to be done for such sites?
The page you shared as an example has most of its content inside a shadow root. Because the
`page.content()` function relies on `document.documentElement.outerHTML`,
it doesn't pierce the shadow root. That's why the saved HTML looks incomplete.