Search code examples
javascript, node.js, twitter, web-scraping, puppeteer

Puppeteer: Scrolling down twitter timeline stops


I am having trouble with scraping all tweet URLs on a user timeline with puppeteer.

With puppeteer, the script is supposed to scroll down the timeline on each iteration of the while loop in the scrollToEnd function until it hits the bottom. To monitor the progress, I made the script output the value of the previousHeight variable, which is the current scrollHeight of document.body, evaluated every time before the scrolling is executed.

However, the scrolling stops once the output value reaches 285,834. What's puzzling is that the script neither breaks out of the while loop nor does the page.waitForFunction method throw a timeout error.

How should I rewrite the scrollToEnd function or any other part of the script so that the function ends properly?

Here is a snippet of my code. Irrelevant functions are left out for brevity.

const puppeteer = require('puppeteer');

// Timeline URLs to scrape; the main routine below navigates to each one in turn.
var UserUrls = ['https://twitter.com/someuser'];

// more functions here

/**
 * Scrolls `page` down repeatedly until the document stops growing.
 *
 * Each pass records `document.body.scrollHeight`, scrolls down by that
 * amount, waits `ScrollDelay` milliseconds so lazily loaded content can be
 * appended, and then reads the height again. The loop ends as soon as a pass
 * does not increase the height.
 *
 * The heights are compared here in Node rather than through
 * `page.waitForFunction()`: that call resolves to a handle (always truthy)
 * or rejects on timeout, so it cannot serve as a loop condition.
 *
 * This is best-effort: any error raised while scrolling is logged and the
 * function returns normally, so the caller can still work with whatever was
 * loaded so far.
 *
 * @param {object} page - Puppeteer page; only its `evaluate` method is used.
 * @param {number} [ScrollDelay=1000] - Milliseconds to wait after each scroll.
 * @returns {Promise<void>}
 */
async function scrollToEnd(
    page,
    ScrollDelay = 1000
) {
    const readHeight = () => page.evaluate('document.body.scrollHeight');
    const pause = (ms) => new Promise((resolve) => { setTimeout(resolve, ms); });

    try {
        let previousHeight = 0;
        let currentHeight = await readHeight();

        while (currentHeight > previousHeight) {
            previousHeight = currentHeight;
            // Progress output: the document height before this scroll step.
            console.log(previousHeight);

            await page.evaluate('window.scrollBy(0, document.body.scrollHeight)');
            // A plain timer is used because page.waitFor() is deprecated.
            await pause(ScrollDelay);

            currentHeight = await readHeight();
        }
    } catch (e) {
        // Keep the best-effort contract, but do not hide the failure.
        console.error(e);
    }
}

/**
 * Entry point: opens a browser, visits every URL in `UserUrls`, scrolls each
 * timeline to the end, takes a screenshot and collects the tweet URLs.
 *
 * The browser is closed in a `finally` block so that a failure part-way
 * through does not leave a Chromium process behind, and any error is reported
 * instead of surfacing as an unhandled rejection.
 */
(async () => {
    const browser = await puppeteer.launch();
    let tweetUrls = [];
    try {
        const page = await browser.newPage();
        for (const UserUrl of UserUrls) {
            await page.goto(UserUrl);
            // scrollToEnd drives the page from Node, so it is awaited directly.
            // It must not be wrapped in page.evaluate(), which expects a
            // function or expression to run inside the browser.
            await scrollToEnd(page);
            await page.screenshot({ path: 'PageEnd.png' });
            // NOTE(review): both the screenshot file and tweetUrls are
            // overwritten on every iteration, so only the last user's results
            // survive. Confirm whether they should be accumulated instead.
            tweetUrls = await getTweetUrls(page, extractItems, 100);
        }
    } finally {
        await browser.close();
    }
    console.log(tweetUrls);
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

Solution

  • Could you try one of these two approaches? This script tries to scroll to the bottom either by comparing scroll heights (as you did) or by waiting for the element that marks the end of the stream to become visible. All scroll logic is placed inside functions evaluated in the browser context. Both functions return the number of tweets on the fully loaded page, so the result can be compared with the user's tweet count shown at the top of the timeline. Also, I've changed the delay to 3 seconds for the first approach, as 1 second sometimes seems too short for the scroll height to change.

    'use strict';
    
    const puppeteer = require('puppeteer');
    
    /**
     * Demo driver: loads the same timeline once per scrolling strategy, runs
     * that strategy inside the page, and prints the value it returns.
     * Any failure is reported on stderr.
     */
    (async function main() {
      try {
        const browser = await puppeteer.launch({ headless: false });
        const [page] = await browser.pages();

        const timelineUrl = 'https://twitter.com/GHchangelog';
        const scrollStrategies = [
          scrollToBottomByMaxHeight,
          scrollToBottomByEndElement,
        ];

        // Each strategy starts from a freshly loaded timeline.
        for (const scrollStrategy of scrollStrategies) {
          await page.goto(timelineUrl);
          const tweetCount = await page.evaluate(scrollStrategy);
          console.log(`Tweets: ${tweetCount}`);
        }

        // The browser is left open here (no browser.close() call),
        // presumably so the scrolled page can be inspected by hand.
      } catch (err) {
        console.error(err);
      }
    })();
    
    /**
     * Scrolls to the bottom of the page by scrolling until the document
     * height stops growing, then returns the number of tweet permalinks found.
     *
     * Meant to be passed to `page.evaluate()`, so it runs in the browser and
     * must stay self-contained (it relies only on `document`, `window` and
     * `setTimeout`).
     *
     * Errors are allowed to propagate: an Error object returned from the page
     * cannot be serialised back to Node, so returning it would hide the
     * failure behind a meaningless value. A thrown error instead rejects the
     * `page.evaluate()` promise.
     *
     * @param {number} [delayMs=3000] - Milliseconds to wait after each scroll
     *   for new content to load.
     * @returns {Promise<number>} Count of `a.js-permalink` elements on the page.
     * @throws {Error} If the document has no scrolling element.
     */
    async function scrollToBottomByMaxHeight(delayMs = 3000) {
      const scroller = document.scrollingElement;
      if (!scroller) {
        throw new Error('document.scrollingElement is not available');
      }

      let previousHeight = 0;
      let currentHeight = scroller.scrollHeight;

      while (previousHeight < currentHeight) {
        previousHeight = currentHeight;
        window.scrollBy(0, previousHeight);
        await new Promise((resolve) => { setTimeout(resolve, delayMs); });
        currentHeight = scroller.scrollHeight;
      }

      return document.querySelectorAll('a.js-permalink').length;
    }
    
    /**
     * Scrolls to the bottom of the page by scrolling until the stream-end
     * marker (`div.stream-end`) has a non-zero client height, then returns
     * the number of tweet permalinks found.
     *
     * Meant to be passed to `page.evaluate()`, so it runs in the browser and
     * must stay self-contained (it relies only on `document`, `window` and
     * `setTimeout`).
     *
     * Errors are allowed to propagate: an Error object returned from the page
     * cannot be serialised back to Node, so returning it would hide the
     * failure. A thrown error instead rejects the `page.evaluate()` promise.
     *
     * @param {number} [delayMs=1000] - Milliseconds to wait after each scroll
     *   for new content to load.
     * @returns {Promise<number>} Count of `a.js-permalink` elements on the page.
     * @throws {Error} If the page has no `div.stream-end` element.
     */
    async function scrollToBottomByEndElement(delayMs = 1000) {
      const endElement = document.querySelector('div.stream-end');
      if (endElement === null) {
        throw new Error('Stream end marker "div.stream-end" was not found on the page');
      }

      // The marker is treated as hidden while its client height is zero.
      while (endElement.clientHeight === 0) {
        window.scrollBy(0, document.scrollingElement.scrollHeight);
        await new Promise((resolve) => { setTimeout(resolve, delayMs); });
      }

      return document.querySelectorAll('a.js-permalink').length;
    }