Search code examples
Tags: node.js, express, busboy, nodejs-stream

Node.js: stream an uploaded file without saving it to disk


I am building an API that needs to accept file uploads. A user can POST a file to an endpoint; the file is sent to a virus scan and then, if it's clean, sent to storage (probably S3). So far I have achieved this with one issue: the files are temporarily saved in the application's file system. I need to design an app that doesn't store uploads locally (neither on disk nor buffered whole in memory). Here is my currently working code:

app.js

const express = require('express');
const bb = require('express-busboy');

const app = express();

// Upload settings handed to express-busboy: accept file uploads and
// write each incoming file under ./tmp.
const busboyOptions = {
    upload: true,
    path: './tmp'
};

// express-busboy extends the Express app so it can handle incoming files.
bb.extend(app, busboyOptions);

Routes.js

const express = require('express');
const router = express.Router();
const fileManagementService = require('./file-management-service')();

/**
 * POST /:fileId handler.
 *
 * Passes the uploaded files and the `fileId` route parameter to the
 * file-management service and replies 201 with the service's result.
 * Any thrown/rejected error is forwarded to Express via `next`.
 */
async function handleFileUpload(req, res, next) {
    try {
        const uploadedFiles = req.files;
        const targetId = req.params.fileId;
        const result = await fileManagementService.postFile(uploadedFiles, targetId);

        res.status(201).json(result);
    } catch (error) {
        next(error);
    }
}

router.route('/:fileId').post(handleFileUpload);

file-management-service.js

const fs = require('fs');

/**
 * Builds the file-management service.
 *
 * @returns {Readonly<{postFile: Function}>} Frozen object exposing `postFile`.
 */
function createUploader() {
    /**
     * POST /:fileId — scans an uploaded temp file and, if the scan status is
     * 'OK', uploads it to storage. The temp file is always removed afterwards,
     * including when the scan or the upload fails.
     *
     * @param {object} data - Uploaded files; `data.file.file` is read as the
     *   path of the temp file on disk.
     * @param {string} fileId - Identifier echoed back in the result.
     * @returns {Promise<{fileId: string, scanned: object, upload: *}>}
     *   `scanned` is the scanner's output; `upload` is the storage result, or
     *   'NOT UPLOADED' when the scan status was not 'OK'.
     * @throws Propagates any error from `scanFile`, `postS3Object` or the
     *   temp-file removal.
     */
    async function postFile(data, fileId) {
        const {file} = data.file;
        const fileStream = fs.createReadStream(file);
        try {
            const scanOutput = await scanFile(fileStream); // Function scans file for viruses
            let upload = 'NOT UPLOADED';
            if (scanOutput.status === 'OK') {
                upload = await postS3Object({file}); // Some function that sends the file to S3 or other storage
            }
            return {
                fileId,
                scanned: scanOutput,
                upload
            };
        } finally {
            // Runs on success AND failure so a rejected scan/upload cannot
            // leave the temp file (or an open descriptor) behind.
            fileStream.destroy();
            fs.unlinkSync(file);
        }
    }

    return Object.freeze({
        postFile
    });
}

module.exports = createUploader;

As mentioned, the above works as expected: the file is sent to be scanned, then sent to an S3 bucket, before a response to that effect is returned to the poster. However, my use of express-busboy stores the file in the ./tmp folder; I then convert it into a readable stream using fs.createReadStream(filePath) before sending it to the AV scanner, and again in the function that sends the file to S3.

This API is hosted in a Kubernetes cluster and I need to avoid creating local state. How can I achieve the above without actually saving the file? I'm guessing busboy receives this file as some sort of stream, so, without sounding dense, can it not just remain a stream and be piped through these functions to achieve the same outcome?


Solution

  • You can use busboy at a slightly lower level and get access to its parsed read stream for each uploaded file. Here's an example from the busboy docs that can be adapted for your situation:

    // Minimal HTTP server: POST requests are parsed with busboy and each
    // uploaded file is piped to a temp file; every other method gets a 404.
    http.createServer(function(req, res) {
      if (req.method === 'POST') {
        const busboy = new Busboy({ headers: req.headers });
        busboy.on('file', function(fieldname, file, filename, encoding, mimetype) {
          // `file` is a readable stream of the uploaded data.
          // os.tmpdir() (lowercase "d") is the supported API; the old
          // os.tmpDir() alias has been removed from Node.js.
          const saveTo = path.join(os.tmpdir(), path.basename(fieldname));
          file.pipe(fs.createWriteStream(saveTo));
        });
        busboy.on('finish', function() {
          res.writeHead(200, { 'Connection': 'close' });
          res.end("That's all folks!");
        });
        return req.pipe(busboy);
      }
      res.writeHead(404);
      res.end();
    }).listen(8000, function() {
      console.log('Listening for requests');
    });
    

    The key part is this, which I've annotated:

        // create a new busboy instance on each incoming request that has files with it
        var busboy = new Busboy({ headers: req.headers });
    
        // register for the file event
        busboy.on('file', function(fieldname, file, filename, encoding, mimetype) {
          // at this point the file argument is a readstream for the data of an uploaded file
          // you can do whatever you want with this readstream such as
          // feed it directly to your anti-virus 
    
          // this example code saves it to a tempfile
          // you would replace this with code that sends the stream to your anti-virus
          var saveTo = path.join(os.tmpDir(), path.basename(fieldname));
          file.pipe(fs.createWriteStream(saveTo));
        });
    
        // this recognizes the end of the upload stream and sends 
        // whatever you want the final http response to be
        busboy.on('finish', function() {
          res.writeHead(200, { 'Connection': 'close' });
          res.end("That's all folks!");
        });
    
        // this gets busboy started, feeding the incoming request to busboy
        // so it can start reading it and parsing it and will eventually trigger
        // one or more "file" events
        return req.pipe(busboy);
    

    When you've identified an incoming request that needs this custom busboy handling, create an instance of Busboy, pass it the request headers, and register for the file event. Each file event hands you a read stream containing the data of one uploaded file. You can then pipe that stream directly to your anti-virus without ever going through the file system.