node.js · file-upload · nodejs-stream · busboy

Hashing a streaming file before uploading to S3


I am trying to stream a file to S3 without writing it to disk/SSD first. I would like to include part of the file's hash in the filename when uploading to S3.

EDIT_v1:
I have been trying to follow this post, using busboy as the parser: Calculate a file hash and save the file. I took an example from the busboy docs and adapted it with an answer from that post:

const http = require('http');
const fs = require('fs');
const crypto = require('crypto');
const busboy = require('busboy');
const { PassThrough } = require('stream');

const server = http.createServer();
server.on('request', async (req, res) => {
  if (req.method === 'POST') {
    const bb = busboy({ headers: req.headers });

    bb.on('file', (name, file, info) => {
      const { filename, encoding, mimeType } = info;
      console.log(
        `File [${name}]: filename: %j, encoding: %j, mimeType: %j`,
        filename,
        encoding,
        mimeType
      );

      // Duplicate the incoming file stream: one copy feeds the hash,
      // the other is written to disk once the hash is known.
      const fileHashSource = new PassThrough();
      const writeSource = new PassThrough();
      file.pipe(fileHashSource);
      file.pipe(writeSource);

      fileHashSource.resume();
      writeSource.resume();

      createFileHash(fileHashSource, (err, hash) => {
        if (err) {
          console.log('err', err)
          return res.end('some err');
        }

        // Use the first 8 hex characters of the hash in the filename.
        const writeStream = fs.createWriteStream(`test_${hash.slice(0, 8)}.png`);
        writeStream.on('error', function(err) {
          console.log('write error', err);
          return res.end('write error')
        });
        writeStream.on('finish', function() {
          console.log('write finished')
          return res.end('done')
        });

        writeSource.pipe(writeStream);
      });
    });
    bb.on('field', (name, val, info) => {
      console.log(`Field [${name}]: value: %j`, val);
    });
    bb.on('close', () => {
      console.log('Done parsing form!');
      req.unpipe(bb);
      res.writeHead(201, { Connection: 'close' });
      res.end('done!');
    });

    req.pipe(bb);
  } else if (req.method === 'GET') {
    res.writeHead(200, { Connection: 'close' });
    res.end(`
      <body style="background-color: black">
        <form enctype="multipart/form-data" method="post">
          <label>file name
            <input type="text" name="textfield" />
          </label><br />
          <label>single file
            <input type="file" name="filefield" />
          </label><br />

          <br />
          <button type="submit">Upload</button>
        </form>
      </body>
    `);
  }
})

server.listen(3000, () => {
  console.info(`NodeJS process: ${process.pid}`)
  console.info(`Listening on port: 3000`)
});


// Pipes the stream into a SHA-1 hash and reads the hex digest on 'finish'.
function createFileHash(readStream, next) {
  const hash = crypto.createHash('sha1');
  hash.setEncoding('hex');

  hash.on('error', function(err) {
    console.log('hash error')
    return next(err);
  });
  hash.on('finish', function() {
    console.log('hash finished');
    return next(null, hash.read());
  });

  readStream.pipe(hash);
}
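
To exercise the POST handler without the browser form, a multipart request can be sent from Node itself. This is a hypothetical smoke test, assuming Node 18+ (global fetch, FormData, and Blob) and a test.png next to the script:

const { readFile } = require('node:fs/promises');

async function smokeTest() {
  // Build a multipart/form-data body matching the form fields above.
  const data = await readFile('test.png');
  const form = new FormData();
  form.append('textfield', 'my upload');
  form.append('filefield', new Blob([data], { type: 'image/png' }), 'test.png');

  const res = await fetch('http://localhost:3000/', { method: 'POST', body: form });
  console.log(res.status, await res.text());
}

smokeTest().catch(console.error);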

EDIT_v2:
See the first answer below for a solution.


Solution

  • I put the task flow into a pipeline, implemented late piping with a PassThrough stream, and finally used a function that returns an async generator that uploads to S3:

    const { PassThrough } = require('stream');
    const { pipeline } = require('stream/promises');
    const { createHash } = require('crypto');

    const { fileStream, mimeType } = createFromBusBoy();
    const s3Source = new PassThrough();

    // Late piping: forward chunks by hand so nothing is dropped before
    // the upload consumer starts reading.
    fileStream.on('data', chunk => {
      s3Source.write(chunk);
    });
    fileStream.on('end', () => {
      s3Source.end();
    });

    const hash = createHash('sha256');
    hash.setEncoding('hex');

    try {
      await pipeline(
        fileStream,
        hash,
        uploadImage(s3Source, mimeType),
      );
    } catch (err) {
      console.log(err);
      throw err;
    }

    function uploadImage(fileStream, mimeType) {
      return async function* (source) {
        // The hash stream emits the hex digest as its only chunk;
        // consume it before starting the upload.
        let digest;
        for await (const chunk of source) {
          digest = chunk;
        }

        yield await uploadToS3(fileStream, digest, mimeType);
      };
    }
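
The answer assumes two helpers it does not show: createFromBusBoy (which hands back busboy's file stream and MIME type) and uploadToS3. A minimal sketch of uploadToS3, assuming the AWS SDK v3 streaming helper from @aws-sdk/lib-storage and a placeholder bucket name:

    const { S3Client } = require('@aws-sdk/client-s3');
    const { Upload } = require('@aws-sdk/lib-storage');

    const s3 = new S3Client({}); // region/credentials come from the environment

    async function uploadToS3(fileStream, hash, mimeType) {
      const upload = new Upload({
        client: s3,
        params: {
          Bucket: 'my-bucket',               // placeholder bucket name
          Key: `upload_${hash.slice(0, 8)}`, // first 8 hex chars of the digest
          Body: fileStream,                  // streamed straight from the request
          ContentType: mimeType,
        },
      });
      return upload.done();
    }

Upload performs a multipart upload under the hood, so the body can be a stream of unknown length, which is what lets the file reach S3 without ever touching disk.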