Search code examples
javascriptnode.jsexpressamazon-s3aws-sdk

AWS SDK file upload to S3 via Node/Express using stream PassThrough - file is always corrupt


It's pretty straightforward. With this code, any image file that is uploaded ends up corrupt and cannot be opened. PDFs seem fine, but I noticed that extra values are being injected into text-based files. The object in S3 has the correct file size, not zero as it would if something had gone wrong. I'm not sure whether the problem is with Express, the SDK, or a combination of both. Or is it Postman? I built something similar for a work project in March of this year, and it worked flawlessly, but I no longer have access to that code to compare.

No errors, no indication of any problems.

const aws = require("aws-sdk");
const stream = require("stream");
const express = require("express");
const router = express.Router();

// AWS credentials and bucket settings. Each value is read from the
// environment when it is set, so real secrets never have to be committed to
// source control. The string literals are placeholders only.
const AWS_ACCESS_KEY_ID = process.env.AWS_ACCESS_KEY_ID || "XXXXXXXXXXXXXXXXXXXX";
const AWS_SECRET_ACCESS_KEY = process.env.AWS_SECRET_ACCESS_KEY || "superSecretAccessKey";
const BUCKET_NAME = process.env.S3_BUCKET_NAME || "my-bucket";
const BUCKET_REGION = process.env.AWS_REGION || "us-east-1";

// S3 client shared by every upload in this module.
const s3 = new aws.S3({
    region: BUCKET_REGION,
    accessKeyId: AWS_ACCESS_KEY_ID,
    secretAccessKey: AWS_SECRET_ACCESS_KEY
});

/**
 * Starts a managed S3 upload whose body is fed through a PassThrough stream.
 *
 * @param {string} key - Object key to write inside BUCKET_NAME.
 * @returns {{streamPass: stream.PassThrough, streamPromise: Promise<object>}}
 *   `streamPass` is the stream callers pipe their data into; `streamPromise`
 *   fulfils with the S3 upload result or rejects with the upload error.
 */
const uploadStream = key => {
    const streamPass = new stream.PassThrough();
    const params = {
        Bucket: BUCKET_NAME,
        Key: key,
        Body: streamPass
    };
    // Use the promise API only. Passing a callback to s3.upload() already
    // calls send() on the managed upload; calling .promise() afterwards calls
    // send() a second time, which replaces that callback and attaches the
    // body-stream listeners twice.
    const streamPromise = s3.upload(params).promise().then(
        data => {
            console.log("INFO: uploadStream:", data);
            return data;
        },
        err => {
            console.error("ERROR: uploadStream:", err);
            // Re-throw so awaiting callers still observe the failure.
            throw err;
        }
    );
    return {
        streamPass: streamPass,
        streamPromise: streamPromise
    };
};

/**
 * POST /upload?file_name=<key>
 *
 * Streams the raw request body to S3 under the key given in the `file_name`
 * query parameter. Responds 200 on success, 400 when `file_name` is missing,
 * and 500 when the upload fails.
 *
 * NOTE: the body is stored verbatim. A multipart/form-data request therefore
 * lands in the bucket with its boundary lines and part headers included; send
 * the file as a raw/binary body, or parse the multipart payload first.
 */
router.post("/upload", async (req, res) => {
    const key = req.query.file_name;
    // Without a usable key the upload would be issued with an undefined Key.
    if (typeof key !== "string" || key === "") {
        res.status(400).send({ result: "Fail!", error: "file_name query parameter is required" });
        return;
    }
    try {
        const { streamPass, streamPromise } = uploadStream(key);
        req.pipe(streamPass);
        await streamPromise;
        res.status(200).send({ result: "Success!" });
    } catch (e) {
        // Log the cause instead of discarding it silently.
        console.error("ERROR: upload:", e);
        res.status(500).send({ result: "Fail!" });
    }
});

module.exports = router;

Here's my package.json:

{
  "name": "expresss3streampass",
  "version": "0.0.0",
  "private": true,
  "scripts": {
    "start": "node ./bin/www"
  },
  "dependencies": {
    "aws-sdk": "^2.812.0",
    "cookie-parser": "~1.4.4",
    "debug": "~2.6.9",
    "express": "~4.16.1",
    "morgan": "~1.9.1"
  }
}

UPDATE:

After further testing, I noticed plain-text files are being changed by Postman. For example, this source file:

{
    "question_id": null,
    "position_type_id": 1,
    "question_category_id": 1,
    "position_level_id": 1,
    "question": "Do you test your code before calling it \"done\"?",
    "answer": "Candidate should respond that they at least happy path test every feature and bug fix they write.",
    "active": 1
}

...looks like this after it lands in the bucket:

----------------------------472518836063077482836177
Content-Disposition: form-data; name="file"; filename="question.json"
Content-Type: application/json

{
    "question_id": null,
    "position_type_id": 1,
    "question_category_id": 1,
    "position_level_id": 1,
    "question": "Do you test your code before calling it \"done\"?",
    "answer": "Candidate should respond that they at least happy path test every feature and bug fix they write.",
    "active": 1
}
----------------------------472518836063077482836177--

I have to think this is the problem. Postman is the only thing that changed in this equation, from when this code first worked for me. My request headers look like this:

enter image description here

I was the one who had originally added the "application/x-www-form-urlencoded" header. If I use it now, I end up with a 0-byte file in the bucket.


Solution

  • Multer is the way to go. (I have since updated this answer; scroll down to "A Better Solution".)

    It provides a few different modes, but as far as I could tell, you have to write a custom storage handler in order to access the underlying Stream, otherwise it's going to buffer all the data in memory and only callback once it's done.

    If you check req.file in your route handler, Multer would normally provide a Buffer under the buffer field, but it's no longer present as I don't pass anything along in the callback, so I'm reasonably confident this is streaming as expected.

    Below is a working solution.

    Note: parse.single('image') is passed to the route as middleware; 'image' is the multipart field name I used for the file.

    const aws = require('aws-sdk');
    const stream = require('stream');
    const express = require('express');
    const router = express.Router();
    const multer = require('multer')
    
    // AWS credentials and bucket settings. Each value is read from the
    // environment when it is set, so real secrets never have to be committed
    // to source control. The string literals are placeholders only.
    const AWS_ACCESS_KEY_ID = process.env.AWS_ACCESS_KEY_ID || "XXXXXXXXXXXXXXXXXXXX";
    const AWS_SECRET_ACCESS_KEY = process.env.AWS_SECRET_ACCESS_KEY || "superSecretAccessKey";
    const BUCKET_NAME = process.env.S3_BUCKET_NAME || "my-bucket";
    const BUCKET_REGION = process.env.AWS_REGION || "us-east-1";
    
    // S3 client shared by every upload in this module.
    const s3 = new aws.S3({
        region: BUCKET_REGION,
        accessKeyId: AWS_ACCESS_KEY_ID,
        secretAccessKey: AWS_SECRET_ACCESS_KEY
    });
    
    /**
     * Starts a managed S3 upload whose body is fed through a PassThrough stream.
     *
     * @param {string} key - Object key to write inside BUCKET_NAME.
     * @returns {{streamPass: stream.PassThrough, streamPromise: Promise<object>}}
     *   `streamPass` is the stream callers pipe their data into; `streamPromise`
     *   fulfils with the S3 upload result or rejects with the upload error.
     */
    const uploadStream = key => {
        const streamPass = new stream.PassThrough();
        const params = {
            Bucket: BUCKET_NAME,
            Key: key,
            Body: streamPass
        };
        // Use the promise API only. Passing a callback to s3.upload() already
        // calls send() on the managed upload; calling .promise() afterwards
        // calls send() a second time, which replaces that callback and
        // attaches the body-stream listeners twice.
        const streamPromise = s3.upload(params).promise().then(
            data => {
                console.log('INFO: uploadStream:', data);
                return data;
            },
            err => {
                console.error('ERROR: uploadStream:', err);
                // Re-throw so awaiting callers still observe the failure.
                throw err;
            }
        );
        return {
            streamPass: streamPass,
            streamPromise: streamPromise
        };
    };
    
    /**
     * Multer storage engine that pipes each incoming file stream straight
     * into an S3 upload instead of collecting it in a buffer first.
     */
    class CustomStorage {
        /**
         * Streams one uploaded file to S3.
         *
         * @param {object} req - Express request; the S3 key is read from
         *   `req.query.file_name`.
         * @param {object} file - Multer file descriptor; `file.stream` is the
         *   readable stream of the file's contents.
         * @param {function(Error=, object=): void} cb - Called with
         *   `(null, {})` once the upload finishes, or with the error when it
         *   fails.
         */
        _handleFile(req, file, cb) {
            const key = req.query.file_name;
            const { streamPass, streamPromise } = uploadStream(key);
            file.stream.pipe(streamPass);
            // Forward failures to Multer as well. Without the rejection
            // handler a failed upload is an unhandled rejection and `cb` is
            // never invoked, so the request never completes.
            streamPromise.then(() => cb(null, {}), cb);
        }

        /**
         * Required by Multer's storage-engine contract; invoked to discard a
         * stored file when request processing fails. This engine keeps
         * nothing locally, so there is nothing to clean up here. An object
         * that already reached S3 is NOT deleted.
         *
         * @param {object} req - Express request (unused).
         * @param {object} file - Multer file descriptor (unused).
         * @param {function(Error=): void} cb - Completion callback.
         */
        _removeFile(req, file, cb) {
            cb(null);
        }
    }
    
    // Multer instance backed by the streaming storage engine.
    const storage = new CustomStorage();
    const parse = multer({storage});
    // 'image' is the multipart field name expected to carry the file.
    const parseImage = parse.single('image');
    
    /**
     * POST /upload
     *
     * Parses the multipart body with Multer and replies with a JSON result:
     * 200 when parsing/storage succeeded, 500 when Multer reported an error.
     */
    router.post('/upload', (req, res) => {
        // Run the Multer middleware by hand so its errors arrive in this
        // callback. When it is mounted as route middleware, failures are
        // passed to next(err) and a try/catch inside the handler never sees
        // them.
        parseImage(req, res, err => {
            if (err) {
                console.error(err);
                res.status(500).send({ result: 'Fail!' });
                return;
            }
            res.status(200).send({ result: 'Success!' });
        });
    });
    
    module.exports = router;
    

    Update: A Better Solution

    The Multer based solution I provided above is a bit hacky. So I took a look under the hood to see how it worked. This solution just uses Busboy to parse and stream the file. Multer is really just a wrapper for this with some disk I/O convenience functions.

    const aws = require('aws-sdk');
    const express = require('express');
    const Busboy = require('busboy');
    const router = express.Router();
    
    // AWS credentials and bucket settings. Each value is read from the
    // environment when it is set, so real secrets never have to be committed
    // to source control. The string literals are placeholders only.
    const AWS_ACCESS_KEY_ID = process.env.AWS_ACCESS_KEY_ID || "XXXXXXXXXXXXXXXXXXXX";
    const AWS_SECRET_ACCESS_KEY = process.env.AWS_SECRET_ACCESS_KEY || "superSecretAccessKey";
    const BUCKET_NAME = process.env.S3_BUCKET_NAME || "my-bucket";
    const BUCKET_REGION = process.env.AWS_REGION || "us-east-1";
    
    // S3 client shared by every upload in this module.
    const s3 = new aws.S3({
        region: BUCKET_REGION,
        accessKeyId: AWS_ACCESS_KEY_ID,
        secretAccessKey: AWS_SECRET_ACCESS_KEY
    });
    
    /**
     * Parses a multipart/form-data request with Busboy and streams every file
     * part to S3, using the part's file name as the object key.
     *
     * @param {object} request - Incoming HTTP request; its `headers` are given
     *   to Busboy and its body is piped into the parser.
     * @returns {Promise<void>} Resolves once parsing has finished and every
     *   upload has completed. Rejects when Busboy cannot be constructed or
     *   reports an error, when any upload fails, or when the request carries
     *   no file part.
     */
    function multipart(request){
        // The executor is deliberately NOT async: a synchronous throw (e.g.
        // from the Busboy constructor) then rejects the returned promise
        // instead of being lost in a second, unobserved promise.
        return new Promise((resolve, reject) => {
            const headers = request.headers;
            const busboy = new Busboy({ headers });
            const uploads = [];
            // you may need to add cleanup logic using 'busboy.on' events
            busboy.on('error', err => reject(err));
            busboy.on('file', function (fieldName, fileStream, fileName) {
                const params = {
                    Bucket: BUCKET_NAME,
                    Key: fileName,
                    Body: fileStream
                };
                const upload = s3.upload(params).promise();
                // Attach the rejection handler immediately so a failure is
                // reported right away and never becomes an unhandled rejection.
                upload.catch(reject);
                uploads.push(upload);
            });
            busboy.on('finish', () => {
                // Settle even when no file part was present; otherwise the
                // caller would wait forever.
                if (uploads.length === 0) {
                    reject(new Error('No file was found in the multipart request'));
                    return;
                }
                // Wait for every file, not just the first one.
                Promise.all(uploads).then(() => resolve(), reject);
            });
            request.pipe(busboy);
        });
    }
    
    /**
     * POST /upload handler: hands the request to multipart() and reports the
     * outcome as JSON, 200 when it resolves and 500 (after logging the error)
     * when it rejects.
     */
    const handleUpload = async (request, response) => {
        try {
            await multipart(request);
            response.status(200).send({ result: 'Success!' });
        } catch (uploadError) {
            console.log(uploadError);
            response.status(500).send({ result: 'Fail!' });
        }
    };
    
    router.post('/upload', handleUpload);
    
    module.exports = router;