I am trying to scrape a page on Twitter to get the tweets.
I want to get each tweet's text, image, and video separately, but I keep getting an empty array:
//Scraper.js
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const fs = require('fs');

async function scrapeTwitter() {
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://twitter.com/coindesk');
    await page.waitForLoadState('networkidle2');

    const html = await page.content();
    const $ = cheerio.load(html);
    const tweets = $('[data-testid="tweet"]');

    const posts = [];
    tweets.each(function () {
      const text = $(this).find('.tweet-text').text().trim();
      const image = $(this).find('.tweet-image').attr('src');
      const video = $(this).find('.tweet-video').attr('src');
      posts.push({ text, image, video });
    });

    await browser.close();
    return posts;
  } catch (error) {
    console.error('Error scraping Twitter:', error);
    return [];
  }
}

module.exports = scrapeTwitter;
I wouldn't use Cheerio with Puppeteer. Puppeteer already works with the live page, so there's no sense in serializing the whole thing to dump into a static HTML parser. If you want to scrape more tweets, you'll need to scroll down, then re-snapshot the whole page to keep Cheerio in sync with the dynamic site.
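For instance, here's a minimal sketch of pulling tweet text straight from the live page with page.$$eval, using the same data-testid selectors as the full script below and assuming the tweets have already rendered:

const texts = await page.$$eval('[data-testid="tweetText"]', els =>
  // runs in the browser context against the live DOM; no HTML dump needed
  els.map(el => el.textContent.trim())
);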
Also, drop the waitForLoadState call. That's a Playwright method, not a Puppeteer one, so it throws, your catch block swallows the error, and the function returns the empty array you're seeing. Puppeteer's goto already accepts a {waitUntil: "networkidle2"} option, so fold the wait into the navigation rather than tacking a second call on after it.
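With the URL from your snippet, the navigation becomes a single call:

await page.goto("https://twitter.com/coindesk", {waitUntil: "networkidle2"});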
Beyond that, your selectors don't return anything for me. Perhaps try something like:
const puppeteer = require("puppeteer"); // ^22.6.0
const url = "<Your URL>";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const ua =
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36";
await page.setUserAgent(ua);
await page.setRequestInterception(true);
const blockedResources = ["stylesheet", "font"];
page.on("request", req => {
if (blockedResources.includes(req.resourceType())) {
req.abort();
} else {
req.continue();
}
});
await page.goto(url, {waitUntil: "domcontentloaded"});
const tweetSel = '[data-testid="tweet"]';
await page.waitForSelector(tweetSel);
const data = [];
for (let i = 0; i < 50 && data.length < 20; i++) {
const preLen = await page.$$eval(
tweetSel,
els => els.length
);
await page.keyboard.press("PageDown");
try {
await page.waitForFunction(
`document.querySelectorAll('${tweetSel}').length > ${preLen}`,
{timeout: 2_000}
);
} catch (err) {
// ...
}
const chunk = await page.$$eval(tweetSel, els =>
els.map(el => ({
text: el
.querySelector('[data-testid="tweetText"]')
.textContent.trim(),
photo: el
.querySelector('[data-testid="tweetPhoto"] img')
?.getAttribute("src"),
}))
);
for (const e of chunk) {
if (data.every(f => f.text !== e.text)) {
data.push(e);
}
}
}
console.log(data);
console.log(data.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
Intercepting API responses is probably more reliable than touching the DOM. There's plenty of room for improvement in the script above, and the timeouts are somewhat arbitrary guesses.
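As a rough sketch of that approach: at the time of writing, the timeline data arrives via a GraphQL request whose URL contains "UserTweets", but that's an undocumented detail that can change at any time, so treat the endpoint name and payload shape as assumptions.

// Register the listener before page.goto so the first batch isn't missed.
page.on("response", async res => {
  if (res.url().includes("UserTweets")) {
    const body = await res.json().catch(() => null); // skip non-JSON responses
    if (body) {
      // The payload shape is undocumented; log it and drill into the
      // timeline entries you actually need.
      console.log(JSON.stringify(body).length, "bytes of timeline JSON");
    }
  }
});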
Disclosure: I'm the author of the linked blog posts.