Search code examples
node.js, web-scraping, browser, puppeteer, headless

Fetching data with puppeteer


I am trying to fetch the links of all images on the first search results page using Puppeteer, but I only get 6 links out of a total of 40. Here is my code:

const puppeteer = require('puppeteer');

// Launch a headless browser, open the search results page, collect the `src`
// of every element carrying the product-image classes, print them, then close
// the browser.
(async () => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  const searchUrl = 'https://shopee.vn/search?keyword=iphone%20xs';
  await page.goto(searchUrl, { waitUntil: 'networkidle0' });

  // This callback runs inside the page, not in Node: only the returned
  // (serialisable) array of strings travels back to the script.
  const links = await page.evaluate(() => {
    const productImages = document.getElementsByClassName("_1T9dHf _3XaILN");
    return Array.from(productImages, (element) => element.src);
  });

  links.forEach((link, index) => {
    console.log('Links of ' + index + ' images : ', link);
  });

  await browser.close();
})();

What should I change to get all 40 links from the first search results page? Thanks.


Solution

  • I believe the site in question lazy-loads its images and/or manipulates the DOM on demand, so only the products that are in view when the page first loads have their image elements populated.

    So, we'll try scrolling down the page, using an adaptation of this excellent answer:

    Puppeteer - scroll down until you can't anymore.

    We also take a screenshot of the page (open ./page.png in the directory you run the script from), which helps you see what has loaded and what has not. The code is below:

    const puppeteer = require('puppeteer');
    
    /**
     * Scroll the page downwards in small steps so that content which is only
     * loaded when it comes into view gets a chance to render.
     *
     * The returned promise resolves once the accumulated scroll distance is
     * at least `document.body.scrollHeight`.
     *
     * @param {object} page - Object exposing `evaluate(fn, ...args)`, e.g. a
     *   Puppeteer page; `fn` is executed where `window`/`document` exist.
     * @param {object} [options]
     * @param {number} [options.step=50] - Pixels scrolled per tick.
     * @param {number} [options.delay=20] - Milliseconds between ticks.
     * @returns {Promise<void>}
     */
    async function scroll(page, { step = 50, delay = 20 } = {}) {
        // The callback is executed in the page context, so it cannot close
        // over Node-side variables; the settings are passed as an argument.
        await page.evaluate(async ({ stepPx, delayMs }) => {
            await new Promise(resolve => {
                let heightScrolled = 0;

                const timer = setInterval(() => {
                    window.scrollBy(0, stepPx);
                    heightScrolled += stepPx;
                    // scrollHeight is re-read on every tick, so a page that
                    // grows while scrolling is followed to its new bottom.
                    if (heightScrolled >= document.body.scrollHeight) {
                        // Stop the timer: otherwise the page keeps being
                        // scrolled after this promise has resolved.
                        clearInterval(timer);
                        resolve();
                    }
                }, delayMs);
            });
        }, { stepPx: step, delayMs: delay });
    }
    
    /**
     * Open `url` in a new browser, scroll through the page, save a full-page
     * screenshot to ./page.png, and collect the `src` of every element that
     * carries the product-image classes.
     *
     * Each link is printed to the console and the full list is also returned.
     *
     * @param {string} url - Address of the page to scrape.
     * @returns {Promise<string[]>} The collected `src` values, in document order.
     */
    async function getImages(url) {
        const browser = await puppeteer.launch();
        try {
            const page = await browser.newPage();

            // Set the viewport before navigating so the page is laid out at
            // its final size from the start instead of being resized later.
            await page.setViewport({
                width: 1200,
                height: 800
            });

            await page.goto(url, {waitUntil: 'networkidle0'});

            await scroll(page);

            // Take an image of the page.. see what it looks like!
            await page.screenshot({
                fullPage: true,
                path: './page.png'
            });

            // Runs inside the page; only the array of strings is sent back.
            const links = await page.evaluate(() => {
                const productImages = document.getElementsByClassName("_1T9dHf _3XaILN");
                return Array.from(productImages, image => image.src);
            });

            links.forEach((link, i) => {
                console.log(`Links of ${i} images : `, link);
            });

            return links;
        } finally {
            // Always release the browser process, even when a step above throws.
            await browser.close();
        }
    }
    
    // Entry point: scrape the search results page and report any failure
    // instead of leaving the promise rejection unhandled.
    const url = 'https://shopee.vn/search?keyword=iphone%20xs';
    getImages(url).catch(err => {
        console.error('Failed to collect image links:', err);
        process.exitCode = 1;
    });