Search code examples
node.js asynchronous unzip nodejs-stream

Nodejs | unzip and read only top n lines of files


I have a zip file in S3 that contains hundreds of CSV files. I am trying to stream the files and need to read only the top n lines of each file. I am able to unzip the archive and read the content, but I am not sure how to stop the stream once I have read n lines and then carry on with the rest of the files.

Code tried so far

const aws = require("aws-sdk");
const s3 = new aws.S3();
const etl = require("etl");
const unzip = require("unzip-stream");

/**
 * Streams the `CSV.zip` object from the `test` S3 bucket through the
 * unzip-stream parser and, for every entry the parser emits, tries to pull a
 * record identifier out of the streamed data.
 *
 * Takes no arguments and returns nothing; the only output is what gets
 * written to the console.
 */
function setupMetadata() {
  s3.getObject({Bucket: 'test', Key: 'CSV.zip'}).createReadStream()
    .pipe(unzip.Parse())
    // The handler runs for each 'entry' event (presumably one per file in the
    // archive - confirm against unzip-stream); `i` and `recordIdentifier`
    // are fresh for every invocation of the handler.
    .on('entry', function (entry) {
      var i = 0;
      var recordIdentifier;
      entry
      .pipe(etl.map(res => {
        // Can only be truthy after an earlier invocation of this callback has
        // assigned it further down, so the first value is never logged here.
        if (recordIdentifier) {
          console.log(recordIdentifier);
          console.log(i++);
          // not sure about this. This works but it only works for 1st file
          // after that the program terminates. I need to do that for all the
          // files in the zip
          entry.destroy(); 
        }
        // NOTE(review): `res` is presumably a raw chunk of the entry's
        // contents rather than a single line - confirm.
        const data = res.toString("utf-8");
        // `data` was just produced by toString(), so the second toString()
        // call below looks redundant.
        var array = data.toString().split("\n");
        // Use the first comma-separated field of the third line as the
        // identifier; values with fewer than three lines are skipped.
        if(array.length >= 3) {
          recordIdentifier = array[2].split(",")[0];
        }
        // Nothing is returned from this callback.
      }))
    })
}

setupMetadata();

I have tried calling entry.autodrain() after reading the content, but it doesn't work. entry.destroy() works, but the program terminates after that. I want to do the same for all the files in the zip.

Any help will be really appreciated.

Thanks in advance.


Solution

  • I have tried to reproduce a similar case. I think you need something like this:

    const etl = require("etl");
    const unzip = require("unzip-stream");
    const fs = require('fs');
    
    /**
     * Logs the first `maxRecords` records of one zip entry and then stops
     * reading that entry.
     *
     * The entry is piped through `etl.csv()` and every record that stage
     * emits is handed to an `etl.map` callback, which logs it. Once the limit
     * is reached the CSV stage is destroyed and the remainder of the entry is
     * discarded via `entry.autodrain()`.
     *
     * @param {object} entry - Entry stream from the unzip parser; must provide
     *   `pipe()` and `autodrain()`.
     * @param {number} entNum - Position of the entry in the archive. Currently
     *   unused; kept so existing callers keep working.
     * @param {number} [maxRecords=3] - How many records to log before the
     *   entry is abandoned. A value of 0 or less skips the entry entirely.
     * @returns {void}
     */
    function readRecord(entry, entNum, maxRecords = 3) {
        if (maxRecords <= 0) {
            // Nothing to read: just discard the entry.
            entry.autodrain();
            return;
        }

        let recordCount = 0;
        let finished = false;
        const etlcsv = entry.pipe(etl.csv());

        etlcsv.pipe(etl.map(d => {
            // Records that were already buffered when the limit was hit still
            // reach this callback; ignore them so that exactly `maxRecords`
            // rows are logged and the teardown below runs only once.
            if (finished) {
                return;
            }

            console.log(d);
            recordCount++;

            if (recordCount >= maxRecords) {
                finished = true;
                etlcsv.destroy();
                entry.autodrain();
            }
        }));
    }
    
    /**
     * Opens `csv.zip`, pipes it through the unzip parser and passes every
     * entry the parser emits to `readRecord`, along with a running 1-based
     * entry number that is also printed to the console.
     */
    function setupMetadata() {
        const archive = fs.createReadStream('csv.zip');
        const parser = archive.pipe(unzip.Parse());
        let seen = 0;

        parser.on('entry', (entry) => {
            seen += 1;
            console.log(seen);
            readRecord(entry, seen);
        });
    }
    
    setupMetadata()
    

    Check this REPL for test: https://repl.it/@sandeepp2016/PlushFatherlyMonotone