Search code examples
javascript, node.js, promise, puppeteer, fs

How to scrape multiple pages with puppeteer


I'm trying to scrape prices from multiple pages using Puppeteer. What I'm having trouble with is writing a single JSON file that contains all the scraped data. The problem is that if I try to write the file using the variables declared inside the async functions, I get an error saying that the variable hasn't been declared.

/**
 * Scrapes two text values from the page at `url` and logs them.
 *
 * Launches its own browser, navigates to `url`, reads the `textContent` of
 * the two XPath targets, and prints both the raw strings and their
 * JSON-encoded forms. The browser is always closed, even when navigation or
 * an element lookup fails.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Settles after the values are logged and the browser is closed.
 * @throws {Error} If either XPath matches no element on the page.
 */
async function scrapeVMZ(url) {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto(url);

        const [vmzel1] = await page.$x('//*[@id="__layout"]/div/div[1]/section/div/div/div[2]/div[2]/div[1]/div/div/div[2]/div/div[1]/span[2]');
        if (!vmzel1) {
            // $x resolves to an empty array when nothing matches; fail with a clear message.
            throw new Error('scrapeVMZ: first price element not found');
        }
        const vmztxt1 = await vmzel1.getProperty('textContent');
        const vmzRawTxt1 = await vmztxt1.jsonValue();

        const [vmzel2] = await page.$x('//*[@id="__layout"]/div/div[1]/section/div/div/div[2]/div[2]/div[1]/div/div/div[2]/div/div[1]/span[4]/b');
        if (!vmzel2) {
            throw new Error('scrapeVMZ: second price element not found');
        }
        const vmztxt2 = await vmzel2.getProperty('textContent');
        const vmzRawTxt2 = await vmztxt2.jsonValue();

        console.log({vmzRawTxt1, vmzRawTxt2});
        const vmz01 = JSON.stringify(vmzRawTxt1);
        const vmz02 = JSON.stringify(vmzRawTxt2);
        console.log(vmz01, vmz02);
    } finally {
        // Await the close so the browser process is released before the promise settles.
        await browser.close();
    }
}
// Handle a failed scrape explicitly instead of leaving the promise floating.
scrapeVMZ('https://www.vmzviagens.com.br/ingressos/orlando/walt-disney-orlando')
    .catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });


/**
 * Scrapes two text values from the page at `url` and logs them.
 *
 * Launches its own browser, navigates to `url`, reads the `textContent` of
 * the two XPath targets, and prints both the raw strings and their
 * JSON-encoded forms. The browser is always closed, even when navigation or
 * an element lookup fails.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Settles after the values are logged and the browser is closed.
 * @throws {Error} If either XPath matches no element on the page.
 */
async function scrapeMB(url) {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto(url);

        const [mbel1] = await page.$x('/html/body/section[3]/div/div/div[2]/div[1]/div/div[2]/a[1]/span[2]/span/div/div[2]/span');
        if (!mbel1) {
            // $x resolves to an empty array when nothing matches; fail with a clear message.
            throw new Error('scrapeMB: first price element not found');
        }
        const mbtxt1 = await mbel1.getProperty('textContent');
        const mbRawTxt1 = await mbtxt1.jsonValue();

        const [mbel2] = await page.$x('/html/body/section[3]/div/div/div[2]/div[1]/div/div[2]/a[1]/span[2]/span/div/div[4]/span');
        if (!mbel2) {
            throw new Error('scrapeMB: second price element not found');
        }
        const mbtxt2 = await mbel2.getProperty('textContent');
        const mbRawTxt2 = await mbtxt2.jsonValue();

        console.log({mbRawTxt1, mbRawTxt2});
        const mb01 = JSON.stringify(mbRawTxt1);
        const mb02 = JSON.stringify(mbRawTxt2);
        console.log(mb01, mb02);
    } finally {
        // Await the close so the browser process is released before the promise settles.
        await browser.close();
    }
}
// Handle a failed scrape explicitly instead of leaving the promise floating.
scrapeMB('https://www.ingressosmagicblue.com.br/produtos/?mpage=2')
    .catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });

How can I write a JSON file, using the code above, that stores the variables vmz01, vmz02, mb01 and mb02, as in the example below?

// Desired shape of the output: one entry per site, each holding both scraped values.
let abc = {
    MB: { preco: mb01, preco2: mb02 },
    VMZ: { preco: vmz01, preco2: vmz02 },
};

Solution

  • When console.log appears in a function instead of returning results, that's a dead end. Return the results if you want to use them later. Since you're returning promises, you can await them in the caller, either serially or in parallel.

    There's also a lot of repeated code in your functions, and you probably don't need 2 browsers. Here's a quick refactor that runs in parallel in a single browser (the preco keys are sort of awkward--I'd suggest an array here potentially).

    const fs = require("fs").promises;
    const puppeteer = require("puppeteer"); // ^14.3.0
    
    // XPaths read from the VMZ page; both targets share the same container prefix.
    const vmzContainer =
      '//*[@id="__layout"]/div/div[1]/section/div/div/div[2]/div[2]/div[1]/div/div/div[2]/div/div[1]';
    const vmzPaths = ["span[2]", "span[4]/b"].map(tail => `${vmzContainer}/${tail}`);
    
    // XPaths read from the Magic Blue page; both targets share the same container prefix.
    const mbContainer =
      "/html/body/section[3]/div/div/div[2]/div[1]/div/div[2]/a[1]/span[2]/span/div";
    const mbPaths = ["div[2]/span", "div[4]/span"].map(tail => `${mbContainer}/${tail}`);
    
    /**
     * Opens `url` in a new page of `browser` and reads the text of each XPath target.
     *
     * All paths are waited on concurrently; the result keeps the order of `paths`.
     * The page is closed before the promise settles so repeated calls do not
     * accumulate open pages in the shared browser.
     *
     * @param {object} browser - Puppeteer browser used to open the page.
     * @param {string} url - Address to navigate to.
     * @param {string[]} paths - XPath expressions to wait for and read.
     * @returns {Promise<string[]>} `textContent` of each matched node, in `paths` order.
     */
    const scrape = async (browser, url, paths) => {
      const page = await browser.newPage();
      try {
        await page.goto(url);
        // `return await` keeps the page open until every lookup has settled.
        return await Promise.all(paths.map(async p => {
          const handle = await page.waitForXPath(p);
          return handle.evaluate(e => e.textContent);
        }));
      } finally {
        // Best-effort cleanup: a failed close must not replace the scrape result or error.
        try {
          await page.close();
        } catch {
          // Ignored on purpose; closing the browser later releases the page anyway.
        }
      }
    };
    
    // Declared outside the async function so the cleanup step can reach the
    // browser even when scraping fails part-way through.
    let browser;

    /**
     * Scrapes both sites concurrently in one browser and writes the collected
     * values to out.json, keyed by site name and then by preco / preco2 / ...
     */
    const run = async () => {
      browser = await puppeteer.launch({headless: true});

      // Site label, page address and XPath list, in output order.
      const sites = [
        ["MB", "https://www.ingressosmagicblue.com.br/produtos/?mpage=2", mbPaths],
        ["VMZ", "https://www.vmzviagens.com.br/ingressos/orlando/walt-disney-orlando", vmzPaths],
      ];

      const scraped = await Promise.all(
        sites.map(([, url, paths]) => scrape(browser, url, paths))
      );

      const collected = {};
      scraped.forEach((values, siteIndex) => {
        const entry = {};
        values.forEach((value, valueIndex) => {
          // The first key is plain "preco"; later ones get a 1-based suffix (preco2, preco3, ...).
          const key = valueIndex === 0 ? "preco" : `preco${valueIndex + 1}`;
          entry[key] = value;
        });
        collected[sites[siteIndex][0]] = entry;
      });

      await fs.writeFile("out.json", JSON.stringify(collected, null, 2));
    };

    run()
      .catch(err => console.error(err))
      .finally(() => browser?.close());
    

    As an aside, I'm not a big fan of hyper-precise, browser-generated paths and selectors. These tend to be very brittle, and there's almost always a better way to choose a selector. However, I haven't looked at the pages, in the interest of focusing on the promises issue, so I'll leave that as an exercise for the reader.