Search code examples
node.js · web-scraping · puppeteer

How can I improve the speed of my scraping program?


I have a personal project in which I want to analyze data from the animal protection association in France (SPA), so I use Puppeteer to scrape its website. It's my first Node.js program. Does anyone have an idea of how I can improve its speed? (A run currently takes about 1 hour, which is OK, but I'd like to make it faster!) This is my script:

const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Scrapes the animal cards listed on the SPA adoption page and writes them to
 * `animal_data_<timestamp>.json` in the current working directory.
 *
 * Steps: open the adoption page, accept the cookie banner, scroll down four
 * viewports, then click the "see more" link repeatedly. After each click the
 * cards currently in the DOM are read and any card whose id has not been seen
 * yet is appended to the result set. The loop ends the first time the
 * "see more" link cannot be found or clicked.
 *
 * The browser is always closed, even when a step throws, so a failed run does
 * not leave a headless browser process behind.
 *
 * @returns {Promise<void>} Resolves once the JSON file is written and the
 *     browser is closed.
 * @throws Propagates any Puppeteer or file-system error raised outside the
 *     "see more" click step.
 */
async function run() {
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();

        await page.goto('https://www.la-spa.fr/adoption/');

        console.log('Page loaded');

        // Accept cookies
        await page.waitForSelector('#gdpr-accept');
        await page.click('#gdpr-accept');

        console.log('Cookies accepted');

        // Scroll down one viewport at a time, pausing between steps.
        for (let i = 0; i < 4; i++) {
            await page.evaluate(() => {
                window.scrollBy(0, window.innerHeight);
            });
            await new Promise(resolve => setTimeout(resolve, 5000));
        }

        console.log('Scrolled down');

        const animalData = [];
        const processedIds = new Set();
        let seeMoreActive = true;
        let counter = 0;

        while (seeMoreActive) {
            counter++;
            try {
                await page.waitForSelector('.c-see-more_link', { timeout: 5000 });
                await page.click('.c-see-more_link');
                await new Promise(resolve => setTimeout(resolve, 5000));
            } catch (error) {
                // A timeout is the expected end-of-list signal. Anything else
                // still ends the loop (so collected data is saved) but is
                // reported instead of being silently dropped.
                const isTimeout = error instanceof Error && error.name === 'TimeoutError';
                if (!isTimeout) {
                    console.warn('Stopping "see more" loop after unexpected error:', error);
                }
                seeMoreActive = false;
            }

            // Read every field in a single round trip so all arrays come from
            // the same DOM snapshot and stay index-aligned with each other.
            const scraped = await page.evaluate(() => {
                const cards = Array.from(document.querySelectorAll('a[data-animal-id]'));
                const readAttribute = name => cards.map(card => card.getAttribute(name));
                return {
                    ids: readAttribute('data-animal-id'),
                    links: cards.map(card => card.href),
                    races: readAttribute('data-animal-race'),
                    names: readAttribute('data-animal-nom'),
                    genders: readAttribute('data-animal-gender'),
                    ages: readAttribute('data-animal-age'),
                    sos: readAttribute('data-animal-sos'),
                    species: readAttribute('data-animal-espece'),
                    // NOTE(review): establishments come from a different
                    // selector and are matched to cards by position only;
                    // confirm every card has exactly one such span.
                    establishments: Array.from(
                        document.querySelectorAll('a.f-miniAnimals_establishment span'),
                        span => span.textContent
                    ),
                };
            });

            console.log(`Iteration ${counter}:`);
            console.log('New Animal IDs:', scraped.ids);
            console.log('New Animal Links:', scraped.links);
            console.log('New Animal Races:', scraped.races);
            console.log('New Animal Names:', scraped.names);
            console.log('New Animal Genders:', scraped.genders);
            console.log('New Animal Ages:', scraped.ages);
            console.log('New Animal SOS:', scraped.sos);
            console.log('New Animal Species:', scraped.species);
            console.log('New Animal Establishments:', scraped.establishments);

            // Keep only animals whose id has not been recorded yet.
            scraped.ids.forEach((id, index) => {
                if (processedIds.has(id)) {
                    return;
                }
                processedIds.add(id);
                animalData.push({
                    id,
                    link: scraped.links[index],
                    race: scraped.races[index],
                    age: scraped.ages[index],
                    sos: scraped.sos[index],
                    genders: scraped.genders[index],
                    species: scraped.species[index],
                    name: scraped.names[index],
                    establishment: scraped.establishments[index]
                });
            });
        }

        console.log('Scraping finish');
        const timestamp = new Date().getTime();
        fs.writeFileSync(`animal_data_${timestamp}.json`, JSON.stringify(animalData, null, 2));
    } finally {
        // Always release the browser process, including on failure.
        await browser.close();
    }
}

// Entry point: report a failed scrape and exit with a non-zero status instead
// of leaving the returned promise unhandled.
run().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});

Solution

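  • You can call the site's JSON API directly instead of driving a browser: one small request to read the page count, then a single request that returns all the results, in less than a second: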

    /**
     * Fetches the animal records exposed by the SPA search API.
     *
     * Makes two requests: the first reads `nb_pages` from the base search
     * URL, the second requests that page number with `full=1` and returns
     * the `results` field of the response.
     * NOTE(review): that `paged=<nb_pages>&full=1` yields the complete result
     * set is taken from the original answer; confirm against the API.
     *
     * @returns {Promise<*>} The `results` value of the second response.
     * @throws {Error} If either request completes with a non-2xx HTTP status.
     */
    async function getAllPages() {
      const baseUrl = "https://www.la-spa.fr/app/wp-json/spa/v1/animals/search/?api=1&seed=1";

      // Surface HTTP failures directly instead of failing later while
      // parsing an error page as JSON or reading a missing field.
      const fetchJson = async (url) => {
        const response = await fetch(url);
        if (!response.ok) {
          throw new Error(`Request to ${url} failed with HTTP status ${response.status}`);
        }
        return response.json();
      };

      const { nb_pages: pageCount } = await fetchJson(baseUrl);

      // Build the second URL with the URL API rather than string concatenation.
      const fullUrl = new URL(baseUrl);
      fullUrl.searchParams.set("paged", pageCount);
      fullUrl.searchParams.set("full", "1");

      const { results } = await fetchJson(fullUrl.href);
      return results;
    }
    
    // Print how many records were returned; report failures and exit with a
    // non-zero status instead of leaving the rejection unhandled.
    getAllPages()
      .then((animals) => console.log(animals.length))
      .catch((error) => {
        console.error(error);
        process.exitCode = 1;
      });