Search code examples
javascripthtmlnode.jspuppeteercsv-parser

How to parse CSV correctly for Puppeteer to fill strings from CSV lines to text input on website?


I am trying to learn js/puppeteer and by building a simple web scraper to scrape books info for educational purposes. I am trying to get the web scraper to fill UPC numbers from a CSV file onto the search bar of a book website. I managed to get a the web scraper to scrape the website if I use a single UPC number.

But I have a CSV with a list of UPCs and would love for the web scraper:

  1. to read the CSV file,
  2. grab the UPC from first line,
  3. search for the UPC on website,
  4. scrape the information,
  5. grab the UPC from 2nd line,
  6. repeat 3, 4

Sample CSV:

DATE,QUANTITY,NAME,CODECONTENT,CODETYPE
2021-10-13 20:16:44 +1100,1,"Book 1","9781250035288",9
2021-10-13 20:16:40 +1100,1,"Book 2","9781847245601",9
2021-10-13 20:16:35 +1100,1,"Book 3","9780007149247",9
2021-10-13 20:16:30 +1100,1,"Book 4","9780749958084",9
2021-10-13 20:16:26 +1100,1,"Book 5","9781405920384",9

This is my code so far. I am stuck at async function for the CSV parser where its giving me an undefined result when i do a

console.log(allupcs);

Plus I am not sure how to get the

await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input','9781509847556');

to accept the UPCs

See code below:

const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');

async function getupcs(){
var upcData=[];
fs.createReadStream('Book_Bulk.csv')
    .pipe(parse({delimiter: ':'}))
    .on('data', function(csvrow) {
        // console.log(+csvrow.CODECONTENT);
        //do something with csvrow
        upcData.push(+csvrow.CODECONTENT);        
    })
    .on('end',function() {
      //do something with csvData
      // return upcData;
      console.log(upcData);
    });
}

async function main(){

  // const allupcs = await upcData();

  // console.log(allupcs);

  const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized']});
  const page = await browser.newPage();
  await page.goto('https://www.bookdepository.com/');
  await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input','9781509847556');
  await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');
  
  //Title
  await page.waitForSelector('.item-info h1');
  const title = await page.$eval('.item-info h1', h1 => h1.textContent);

  //Author
  await page.waitForSelector('div.author-info.hidden-md > span > a > span');
  const author = await page.$eval('div.author-info.hidden-md > span > a > span', span => span.innerText);

  //Genre
  await page.waitForSelector('.active a');
  const genre = await page.$eval('.active a', a => a.innerText);

  //Format
  await page.waitForSelector('.item-info li');
  const format = await page.$eval('.item-info li', li => li.innerText);

  //Publisher
  await page.waitForSelector('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span'); 
  const publisher = await page.$eval('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span', span => span.innerText);

  //Year
  await page.waitForSelector('div.biblio-wrap > div > ul > li:nth-child(3) > span'); 
  const year = await page.$eval('div.biblio-wrap > div > ul > li:nth-child(3) > span', span => span.innerText);
  const newyear = year.slice(-4)

  // Price
  try {
    await page.waitForSelector('div.price.item-price-wrap.hidden-xs.hidden-sm > span', { timeout: 1000 });
    const price = await page.$eval('div.price.item-price-wrap.hidden-xs.hidden-sm > span', span => span.innerText);
    var newprice = price.slice(-6);
  } catch {
    await page.waitForSelector('p.list-price'); 
    const price = await page.$eval('p.list-price', p => p.innerText);
    var newprice = price.slice(-6);
  } finally {
    await page.waitForSelector('div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price'); 
    const price = await page.$eval('div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price', span => span.innerText);
    var newprice = price.slice(-6);
  }

  console.log(title);
  console.log(author);
  console.log(genre);
  console.log(format);
  console.log(publisher);
  console.log(newyear);
  console.log(newprice);

  // return {
  //     title: title,
  //     author: author,
  //     genre: genre,
  //     format: format,
  //     publisher: publisher,
  //     year: newyear,
  //     price: newprice
  // }

}

main();

Updated: with code from Answer

const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');


async function getpageData(page,upc){
    await page.goto('https://www.bookdepository.com/');
    await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input',upc);
    await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');
    
    //Title
    await page.waitForSelector('.item-info h1');
    const title = await page.$eval('.item-info h1', h1 => h1.textContent);

    //Author
    await page.waitForSelector('div.author-info.hidden-md > span > a > span');
    const author = await page.$eval('div.author-info.hidden-md > span > a > span', span => span.innerText);

    //Genre
    await page.waitForSelector('.active a');
    const genre = await page.$eval('.active a', a => a.innerText);

    //Format
    await page.waitForSelector('.item-info li');
    const format = await page.$eval('.item-info li', li => li.innerText);

    //Publisher
    await page.waitForSelector('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span'); 
    const publisher = await page.$eval('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span', span => span.innerText);

    //Year
    await page.waitForSelector('div.biblio-wrap > div > ul > li:nth-child(3) > span'); 
    const year = await page.$eval('div.biblio-wrap > div > ul > li:nth-child(3) > span', span => span.innerText);
    const newyear = year.slice(-4)

    // Price
    try {
        await page.waitForSelector('div.price.item-price-wrap.hidden-xs.hidden-sm > span', { timeout: 1000 });
        const price = await page.$eval('div.price.item-price-wrap.hidden-xs.hidden-sm > span', span => span.innerText);
        var newprice = price.slice(-6);
    } catch {
        await page.waitForSelector('p.list-price'); 
        const price = await page.$eval('p.list-price', p => p.innerText);
        var newprice = price.slice(-6);
    } finally {
        await page.waitForSelector('div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price'); 
        const price = await page.$eval('div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price', span => span.innerText);
        var newprice = price.slice(-6);
    }

    
    // console.log(title);
    // console.log(author);
    // console.log(genre);
    // console.log(format);
    // console.log(publisher);
    // console.log(newyear);
    // console.log(newprice);

    return {
        title: title,
        author: author,
        genre: genre,
        format: format,
        publisher: publisher,
        year: newyear,
        price: newprice
    }

};


function readCsvAsync(filename, delimiter=',', encoding='utf-8') {
    return new Promise((resolve, reject) => {
        const rows = [];
        try {
            fs.createReadStream(filename, {encoding: encoding})
                .pipe(parse({delimiter: delimiter}))
                .on('data', (row) => rows.push(+row.CODECONTENT))
                .on('end', () => resolve(rows))
                .on('error', reject);
        } catch (err) {
            reject(err);
        }
    });
}

async function upcData() {
    try {
        const rows = await readCsvAsync('Book_Bulk.csv', ':');
        // console.log(csvData);
        // call puppeteer or whatever
        return rows;
    } catch (err) {
        console.log(err);
    }
}


async function main(){

    const allupcs = await upcData();
  
    // console.log(allupcs);
    const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized']});
    const page = await browser.newPage();
    const scrapedData = [];

    for(let upc of allupcs){
        const data = await getpageData(page,upc);
        scrapedData.push(data);
    }

    console.log(scrapedData);
  
  }

main();


Solution

  • As you have noticed, the CSV parser is asynchronous. "asynchronous" means you can't do this:

    var upcData=[];                               // 1
    fs.createReadStream('Book_Bulk.csv')          // 2
        .pipe(parse({delimiter: ':'}))
        .on('data', (csvrow) {                    // 5 6 7 8 9
            upcData.push(+csvrow.CODECONTENT);   
        })
        .on('end',function() {                    // 10
          console.log(upcData);
        });
    }
    console.log(upcData);                         // 3
    // call puppeteer or whatever                 // 4
    

    I've outlined the order of execution. The last console.log() runs immediately after you set up the read stream. upcData will not contain anything at this point.

    But it will contain data at point #10, and #5 etc will fill it.

    That means: Whatever you want to do with upcData, do it inside the 'end' event handler.

        .on('end',function() {                    // 10
          console.log(upcData);
          for (let upc of upcData) {
            // call puppeteer or whatever
          }
        });
    

    Since csv reader will give you one row per data event, you can also do things directly in the data event handler and not build an upcData array at all.

        .on('data', (csvrow) {                    // 5 6 7 8 9
            const upc = +csvrow.CODECONTENT;
            // call puppeteer or whatever
        })
    

    If you want to be able to await the whole thing, you must turn it into a promise first. In this case again the relevant step (promise resolution) happens in the end callback:

    function readCsvAsync(filename, delimiter=',', encoding='utf-8') {
        return new Promise((resolve, reject) => {
            const rows = [];
            try {
                fs.createReadStream(filename, {encoding: encoding})
                    .pipe(parse({delimiter: delimiter}))
                    .on('data', (row) => rows.push(row))
                    .on('end', () => resolve(rows))
                    .on('error', reject);
            } catch (err) {
                reject(err);
            }
        });
    }
    
    async function main() {
        try {
            const rows = await readCsvAsync('Book_Bulk.csv', ':');
            // call puppeteer or whatever
        } catch (err) {
            console.log(err);
        }
    }