Search code examples
node.js pdf

How to read the content of a .pdf file using nodejs?


I have a scenario that requires automating checks on PDF contents. How can I retrieve the content of a PDF file in Node.js?

I am completely blocked on this. There are a few posts about pdf2json and jsonreader, but those approaches are not working for me. Any help would be appreciated.

// Question's attempt: read the PDF into a buffer and hand it to pdf2json.
// NOTE(review): PDFParser, fs and pdfFilePath are not defined in this snippet;
// presumably they are set up elsewhere — confirm.
var pdfParser = new PDFParser();
// NOTE(review): two functions are passed to fs.readFile here. Node's
// fs.readFile expects (path[, options], callback) with a single callback that
// is invoked as (err, data) — verify which of the two functions actually runs.
fs.readFile(pdfFilePath, function(err, pdfBuffer) {
    pdfParser.parseBuffer(pdfBuffer);
}, function(pdfBuffer){
    // NOTE(review): if this trailing function is the one Node invokes, its
    // first argument would be the error slot rather than the file contents,
    // so parseBuffer would not receive a buffer — likely the source of the
    // "Invalid parameter array" error; confirm.
    pdfParser.parseBuffer(pdfBuffer);
})

Error: Invalid parameter array, need either .data or .url at FSReqWrap.readFileAfterClose [as oncomplete] (fs.js:445:3)


Solution

  • I found the answer and it's working perfectly. Install pdf2json by running npm install pdf2json. The original instructions also ran npm install fs, but fs is a core Node.js module and is available without installing anything.

    // Verifies that a PDF in the current user's Downloads folder contains an
    // expected piece of text.
    //
    // External names this script relies on (not defined here):
    //   browser       - test-runner object exposing assert.ok(...)
    //   textToVerify  - the string expected to appear in the PDF text
    var fs = require('fs');
    var os = require('os');
    var path = require('path');
    var PDFParser = require('pdf2json');

    // Resolve the home directory with the standard library. The earlier
    // version stored the home directory string in a variable named `path`
    // and then read `path.sep` from that string, which is undefined, so the
    // separator normalisation never happened.
    var homeDir = os.homedir();
    // Normalise platform separators (e.g. "\" on Windows) to "/".
    var homepath = homeDir.split(path.sep).join('/');
    var pdfFilePath = homepath + '/Downloads/' + 'filename.pdf';

    if (fs.existsSync(pdfFilePath)) {
      // Read the content of the pdf from the downloaded path.
      // Per the pdf2json README, the second constructor argument (1) turns on
      // raw-text collection, which getRawTextContent() below depends on.
      var pdfParser = new PDFParser(browser, 1);
      pdfParser.on("pdfParser_dataError", function (errData) {
         console.error(errData.parserError);
      });
      pdfParser.on("pdfParser_dataReady", function (pdfData) {
        // Parsing is asynchronous: the assertion runs once the text is ready.
        browser.assert.ok(pdfParser.getRawTextContent().indexOf(textToVerify) > -1);
      });

      pdfParser.loadPDF(pdfFilePath);
    } else {
        console.log('OOPs file not present in the downloaded folder');
        // Fail the test if the file is not found at the expected path.
        browser.assert.ok(fs.existsSync(pdfFilePath));
    }