node.js, web-scraping, puppeteer

Empty array when trying to scrape Twitter with Puppeteer


So I am trying to scrape a page on Twitter to get the tweets.

I want to get the text, image, and video elements separately, but I keep getting an empty array:

//Scraper.js
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const fs = require('fs');

async function scrapeTwitter() {
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://twitter.com/coindesk');
    await page.waitForLoadState('networkidle2');


    const html = await page.content();
    const $ = cheerio.load(html);
    const tweets = $('[data-testid="tweet"]');
    
    const posts = [];
    tweets.each(function () {
      const text = $(this).find('.tweet-text').text().trim();
      const image = $(this).find('.tweet-image').attr('src');
      const video = $(this).find('.tweet-video').attr('src');
      posts.push({ text, image, video });
    });
    
      
    await browser.close();
    
    return posts;
  } catch (error) {
    console.error('Error scraping Twitter:', error);
    return [];
  } 
}

module.exports = scrapeTwitter;

Solution

  • I wouldn't use Cheerio with Puppeteer. Puppeteer already works with the live page, so there's no sense in serializing the whole thing to dump into a static HTML parser. If you want to scrape more tweets, you'll need to scroll down, then re-snapshot the whole page to keep Cheerio in sync with the dynamic site.
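
    As a minimal sketch of what working against the live page looks like (the tweetText selector is an assumption; it's the same one used in the full example below), the whole Cheerio pass collapses into a single $$eval call:

    const texts = await page.$$eval('[data-testid="tweetText"]', els =>
      els.map(el => el.textContent.trim())
    );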

    Also, avoid spurious wait calls: page.waitForLoadState is a Playwright method, not a Puppeteer one, and goto already has a {waitUntil: "networkidle2"} option, so I'd use that rather than slapping a second wait after the navigation, which can cause strange issues.
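
    That is, something like:

    await page.goto(url, {waitUntil: "networkidle2"});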

    Beyond that, your selectors don't return anything for me. Perhaps try something like:

    const puppeteer = require("puppeteer"); // ^22.6.0
    
    const url = "<Your URL>";
    
    let browser;
    (async () => {
      browser = await puppeteer.launch();
      const [page] = await browser.pages();
      const ua =
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36";
      await page.setUserAgent(ua);
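      // intercept requests so stylesheets and fonts can be skipped to speed up loads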
      await page.setRequestInterception(true);
      const blockedResources = ["stylesheet", "font"];
      page.on("request", req => {
        if (blockedResources.includes(req.resourceType())) {
          req.abort();
        } else {
          req.continue();
        }
      });
      await page.goto(url, {waitUntil: "domcontentloaded"});
      const tweetSel = '[data-testid="tweet"]';
      await page.waitForSelector(tweetSel);
      const data = [];
    
      // scroll with PageDown, collecting tweets until we have 20 unique ones or run out of attempts
      for (let i = 0; i < 50 && data.length < 20; i++) {
        const preLen = await page.$$eval(
          tweetSel,
          els => els.length
        );
        await page.keyboard.press("PageDown");
    
        try {
          await page.waitForFunction(
            `document.querySelectorAll('${tweetSel}').length > ${preLen}`,
            {timeout: 2_000}
          );
        } catch (err) {
          // ...
        }
    
        const chunk = await page.$$eval(tweetSel, els =>
          els.map(el => ({
            // optional chaining in case a tweet (e.g. image-only) has no text node
            text: el
              .querySelector('[data-testid="tweetText"]')
              ?.textContent.trim(),
            photo: el
              .querySelector('[data-testid="tweetPhoto"] img')
              ?.getAttribute("src"),
          }))
        );
    
        for (const e of chunk) {
          if (data.every(f => f.text !== e.text)) {
            data.push(e);
          }
        }
      }
    
      console.log(data);
      console.log(data.length);
    })()
      .catch(err => console.error(err))
      .finally(() => browser?.close());
    

    Intercepting Twitter's API responses is probably more reliable than touching the DOM. There's plenty of room for improvement here, and the timeouts are somewhat arbitrary guesses.
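
    A rough sketch of that approach using Puppeteer's response events (the "UserTweets" endpoint name is an assumption; check the devtools network tab for the GraphQL request the timeline actually makes):

    page.on("response", async res => {
      // only look at the timeline's GraphQL responses
      if (res.url().includes("UserTweets")) {
        try {
          const body = await res.json();
          // walk `body` for the tweet entries you need
        } catch (err) {
          // non-JSON or failed response; ignore it
        }
      }
    });

    Register the listener before calling goto so the initial timeline responses aren't missed.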
