python, pyppeteer

Using pyppeteer in a continuous scraping mode


Every example and use case I have seen uses pyppeteer in a mode where the browser is opened and closed immediately, e.g.:

import asyncio
from pyppeteer import launch

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('http://someurl')
    content = await page.content()
    cookieslist = await page.cookies()
    cookiejar = createCookieJar(cookieslist)
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())

What happens if you want to keep the browser open and continuously scrape data? That's easy to do with Selenium, but with pyppeteer it doesn't seem to work without asyncio. The alternative is to save the session, then re-open the browser on a schedule and scrape, but that feels very inefficient. Has anyone tried this?
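The save-and-reopen approach I mean would look roughly like this (a minimal sketch, assuming pyppeteer's userDataDir launch option; the profile path, URL, and 10-minute interval are just placeholders):

import asyncio
from pyppeteer import launch

async def scrape_once():
    # Reusing the same userDataDir persists cookies/session between launches
    browser = await launch(userDataDir='./chrome-profile')
    page = await browser.newPage()
    await page.goto('http://someurl')
    content = await page.content()
    await browser.close()
    return content

async def scheduled():
    # Re-open the browser on a schedule and scrape again
    while True:
        await scrape_once()
        await asyncio.sleep(600)

asyncio.get_event_loop().run_until_complete(scheduled())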


Solution

  • You can keep a single browser open and use an asyncio.Queue: continuously pump URLs into the queue and let a pool of worker tasks (one tab each) pull them off and scrape:

    import asyncio
    import traceback
    
    from contextlib import suppress
    
    from pyppeteer import launch
    
    WORKERS = 10
    URLS = [
        "http://airbnb.com",
        "http://amazon.co.uk",
        "http://amazon.com",
        "http://bing.com",
        "http://djangoproject.com",
        "http://envato.com",
        "http://facebook.com",
        "http://github.com",
        "http://google.co.uk",
        "http://google.com",
        "http://google.es",
        "http://google.fr",
        "http://heroku.com",
        "http://instagram.com",
        "http://linkedin.com",
        "http://live.com",
        "http://netflix.com",
        "http://rubyonrails.org",
        "http://shopify.com",
        "http://stackoverflow.com",
        "http://trello.com",
        "http://wordpress.com",
        "http://yahoo.com",
        "http://yandex.ru",
        "http://yiiframework.com",
        "http://youtube.com",
    ]
    
    
    async def worker(q, browser):
        # One tab per worker
        page = await browser.newPage()
    
        with suppress(asyncio.CancelledError):
            while True:
                # Block until the next URL is available
                url = await q.get()
    
                try:
                    await page.goto(url, {"timeout": 10000})
                    html = await page.content()
                except Exception:
                    traceback.print_exc()
                else:
                    print(f"{url}: {len(html)}")
                finally:
                    q.task_done()
    
        await page.close()
    
    
    async def main():
        q = asyncio.Queue()
        browser = await launch(headless=True, args=["--no-sandbox"])
    
        tasks = []
    
        # Spawn a fixed pool of worker tasks, each with its own tab
        for _ in range(WORKERS):
            tasks.append(asyncio.create_task(worker(q, browser)))
    
        # Enqueue the work, then wait until every URL has been processed
        for url in URLS:
            await q.put(url)

        await q.join()
    
        # Queue drained: cancel the idle workers and wait for them to close their tabs
        for task in tasks:
            task.cancel()

        await asyncio.gather(*tasks, return_exceptions=True)
    
        await browser.close()
    
    
    if __name__ == "__main__":
        asyncio.run(main())
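
  • If you want the scrape to run continuously rather than drain a fixed list once, a minimal variation of the same idea is to add a producer task that keeps re-queuing URLs on an interval and drop the q.join()/cancel() shutdown; the worker count, URL list, and interval below are placeholders:

    import asyncio

    from pyppeteer import launch

    WORKERS = 4
    URLS = ["http://example.com", "http://example.org"]  # placeholder URLs
    SCRAPE_INTERVAL = 60  # seconds between passes, placeholder value


    async def worker(q, browser):
        # Same idea as above: one long-lived tab per worker
        page = await browser.newPage()

        while True:
            url = await q.get()
            try:
                await page.goto(url, {"timeout": 10000})
                html = await page.content()
                print(f"{url}: {len(html)}")
            except Exception as exc:
                print(f"{url}: {exc!r}")
            finally:
                q.task_done()


    async def producer(q):
        # Keep re-queuing the URLs so the workers never run out of work
        while True:
            for url in URLS:
                await q.put(url)
            await asyncio.sleep(SCRAPE_INTERVAL)


    async def main():
        q = asyncio.Queue()
        browser = await launch(headless=True, args=["--no-sandbox"])

        tasks = [asyncio.create_task(worker(q, browser)) for _ in range(WORKERS)]
        tasks.append(asyncio.create_task(producer(q)))

        # Runs until the process is stopped (e.g. Ctrl+C)
        await asyncio.gather(*tasks)


    if __name__ == "__main__":
        asyncio.run(main())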