I am trying to scrape a page on Twitter to get the tweets.
I want to get each tweet's text, image, and video separately, but I keep getting an empty array:
//Scraper.js
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const fs = require('fs');

async function scrapeTwitter() {
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://twitter.com/coindesk');
    await page.waitForLoadState('networkidle2');

    const html = await page.content();
    const $ = cheerio.load(html);
    const tweets = $('[data-testid="tweet"]');

    const posts = [];
    tweets.each(function () {
      const text = $(this).find('.tweet-text').text().trim();
      const image = $(this).find('.tweet-image').attr('src');
      const video = $(this).find('.tweet-video').attr('src');
      posts.push({ text, image, video });
    });

    await browser.close();
    return posts;
  } catch (error) {
    console.error('Error scraping Twitter:', error);
    return [];
  }
}

module.exports = scrapeTwitter;
I wouldn't use Cheerio with Puppeteer. Puppeteer already works with the live page, so there's no sense in serializing the whole thing to dump into a static HTML parser. If you want to scrape more tweets, you'll need to scroll down, then re-snapshot the whole page to keep Cheerio in sync with the dynamic site.
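For instance, here's a minimal sketch of pulling tweet text straight from the live page with page.$$eval, using the same data-testid selectors as the full script below and assuming the tweets have already rendered:

const texts = await page.$$eval('[data-testid="tweetText"]', els =>
  // runs in the browser context against the live DOM; no HTML dump needed
  els.map(el => el.textContent.trim())
);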
Also, drop the waitForLoadState call. That's a Playwright method, not a Puppeteer one, so it throws, your catch block swallows the error, and the function returns the empty array you're seeing. Puppeteer's goto already accepts a {waitUntil: "networkidle2"} option, so fold the wait into the navigation rather than tacking a second call on after it.
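With the URL from your snippet, the navigation becomes a single call:

await page.goto("https://twitter.com/coindesk", {waitUntil: "networkidle2"});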
Beyond that, your selectors don't return anything for me. Perhaps try something like:
const puppeteer = require("puppeteer"); // ^22.6.0
const url = "<Your URL>";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const ua =
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36";
await page.setUserAgent(ua);
await page.setRequestInterception(true);
const blockedResources = ["stylesheet", "font"];
page.on("request", req => {
if (blockedResources.includes(req.resourceType())) {
req.abort();
} else {
req.continue();
}
});
await page.goto(url, {waitUntil: "domcontentloaded"});
const tweetSel = '[data-testid="tweet"]';
await page.waitForSelector(tweetSel);
const data = [];
for (let i = 0; i < 50 && data.length < 20; i++) {
const preLen = await page.$$eval(
tweetSel,
els => els.length
);
await page.keyboard.press("PageDown");
try {
await page.waitForFunction(
`document.querySelectorAll('${tweetSel}').length > ${preLen}`,
{timeout: 2_000}
);
} catch (err) {
// ...
}
const chunk = await page.$$eval(tweetSel, els =>
els.map(el => ({
text: el
.querySelector('[data-testid="tweetText"]')
.textContent.trim(),
photo: el
.querySelector('[data-testid="tweetPhoto"] img')
?.getAttribute("src"),
}))
);
for (const e of chunk) {
if (data.every(f => f.text !== e.text)) {
data.push(e);
}
}
}
console.log(data);
console.log(data.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
Intercepting API responses is probably more reliable than touching the DOM. There's plenty of room for improvement in the script above, and the timeouts are somewhat arbitrary guesses.
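As a rough sketch of that approach: at the time of writing, the timeline data arrives via a GraphQL request whose URL contains "UserTweets", but that's an undocumented detail that can change at any time, so treat the endpoint name and payload shape as assumptions.

// Register the listener before page.goto so the first batch isn't missed.
page.on("response", async res => {
  if (res.url().includes("UserTweets")) {
    const body = await res.json().catch(() => null); // skip non-JSON responses
    if (body) {
      // The payload shape is undocumented; log it and drill into the
      // timeline entries you actually need.
      console.log(JSON.stringify(body).length, "bytes of timeline JSON");
    }
  }
});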
Disclosure: I'm the author of the linked blog posts.