Search code examples
javascriptnode.jsweb-scrapingtimeoutpuppeteer

When I use a proxy flag in puppeteer, waitForSelector returns a TimeoutError


I'm doing some Puppeteer web scraping and for some reason, when I put in a proxy flag generated by creation of an account on this site (https://proxy.webshare.io/proxy/list?), it causes the waitForSelector() to get a TimeoutError. Not sure what the issue is because if I don't use a proxy, no error is thrown.

const puppeteer = require('puppeteer');
const puppeteerExtra = require('puppeteer-extra');
const pluginStealth = require('puppeteer-extra-plugin-stealth');
const proxyChain = require('proxy-chain');

async function scrape() {
  try {
    const preparePageForTests = async (page) => {

      const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36';

      await page.setUserAgent(userAgent);

      await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', {
          get: () => false,
        });
      });

      // Pass the Chrome Test.
      await page.evaluateOnNewDocument(() => {
        // We can mock this in as much depth as we need for the test.
        window.navigator.chrome = {
          app: {
            isInstalled: false,
          },
          webstore: {
            onInstallStageChanged: {},
            onDownloadProgress: {},
          },
          runtime: {
            PlatformOs: {
              MAC: 'mac',
              WIN: 'win',
              ANDROID: 'android',
              CROS: 'cros',
              LINUX: 'linux',
              OPENBSD: 'openbsd',
            },
            PlatformArch: {
              ARM: 'arm',
              X86_32: 'x86-32',
              X86_64: 'x86-64',
            },
            PlatformNaclArch: {
              ARM: 'arm',
              X86_32: 'x86-32',
              X86_64: 'x86-64',
            },
            RequestUpdateCheckStatus: {
              THROTTLED: 'throttled',
              NO_UPDATE: 'no_update',
              UPDATE_AVAILABLE: 'update_available',
            },
            OnInstalledReason: {
              INSTALL: 'install',
              UPDATE: 'update',
              CHROME_UPDATE: 'chrome_update',
              SHARED_MODULE_UPDATE: 'shared_module_update',
            },
            OnRestartRequiredReason: {
              APP_UPDATE: 'app_update',
              OS_UPDATE: 'os_update',
              PERIODIC: 'periodic',
            },
          }
        };
      });

      await page.evaluateOnNewDocument(() => {
        const originalQuery = window.navigator.permissions.query;
        return window.navigator.permissions.query = (parameters) => (
          parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission }) :
            originalQuery(parameters)
        );
      });

      await page.evaluateOnNewDocument(() => {
        // Overwrite the `plugins` property to use a custom getter.
        Object.defineProperty(navigator, 'plugins', {
          // This just needs to have `length > 0` for the current test,
          // but we could mock the plugins too if necessary.
          get: () => [1, 2, 3, 4, 5],
        });
      });

      await page.evaluateOnNewDocument(() => {
        // Overwrite the `plugins` property to use a custom getter.
        Object.defineProperty(navigator, 'languages', {
          get: () => ['en-US', 'en'],
        });
      });
    }

    const oldProxyUrl = 'http://##.##.##.##:####'
    const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);

    const browser = await puppeteerExtra.launch({ 
      headless: true, 
      args: [                
        '--no-sandbox', 
        '--disable-setuid-sandbox', 
        '--proxy-server=http://##.##.##.##:####'// tried hard coding the proxy in, also tried ${newProxyUrl}`,                                      
      ]
    });

    const page = await browser.newPage();
    await page.authenticate({username: 'usernameOnWebsite', password: 'passwordOnWebsite'});
    await preparePageForTests(page);
    await page.goto(`https://www.google.com/search?q=concerts+near+new+york&client=safari&rls=en&uact=5&ibp=htl;events&rciv=evn&sa=X&fpstate=tldetail#htivrt=events&htidocid=L2F1dGhvcml0eS9ob3Jpem9uL2NsdXN0ZXJlZF9ldmVudC8yMDIxLTA2LTA0fDIxMjMzMzg4NTU2Nzc1NDk%3D&fpstate=tldetail`);   
    const results = await getResults(page) //the error occurs inside this function

  } catch(err) {
    console.log(err)
  }
}

In the getResults() function, the first line, which is a waitForSelector() has a TimeoutError fail.

async function getResults(page) {
  try {
    await page.waitForSelector("ul", { timeout: 30000 })
    //I do a bunch of stuff with the ul after this point, but timeout is happening on the line above
  } catch(err) {
    console.log(err)
  }
}

If I remove the proxy and use the built-in Heroku IP, it all works fine. Not sure what the issue is here.


Solution

  • The correct way to use password-protected proxies with proxy-chain would be this:

    // Give the credentials of the proxy to proxy-chain
    const oldProxyUrl = 'http://bob:[email protected]:8000';
    const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);
    
    // Prints something like "http://127.0.0.1:45678"
    console.log(newProxyUrl);
    
    const browser = await puppeteer.launch({
        args: [`--proxy-server=${newProxyUrl}`],
    });
    
    const page = await browser.newPage();
    
    // Go to a page as usual, no need to authenticate
    await page.goto('https://www.example.com');
    

    You need to provide proxy-chain with full proxy URL including usrname and password and then use the local forwarding proxy URL like http://127.0.0.1:45678 when launching Chrome.