Tags: node.js, amazon-s3, aws-sdk, zlib

How to call S3.putObject (or S3.upload) and use a gzip stream as body


As our logging mechanism is not able to create big gz files, I'm trying to do it with a Lambda. It works when I load all of the files from S3 into memory and create the gzip file afterwards, but that needs too much memory. So I'm trying the following instead: start a gzip stream in memory and, whenever I receive the content of a file from S3, write it to the gzip stream. Without luck so far. Among other ideas, I tried the code below.

I read in https://github.com/aws/aws-sdk-js/issues/2961 that the aws-sdk needs to know the length of a stream before uploading it. That's why I use the streamToBuffer function, which is also described in that issue.

const aws = require('aws-sdk')
const zlib = require('zlib')
const stream = require('stream')
const s3 = new aws.S3()

function streamToBuffer(readableStream) {
  const chunks = []
  return new Promise((resolve, reject) => {
    readableStream.on('data', (chunk) => chunks.push(Buffer.from(chunk)))
    readableStream.on('error', (err) => reject(err))
    readableStream.on('end', () => resolve(Buffer.concat(chunks)))
  })
}

const gzip = zlib.createGzip()
gzip.setEncoding('utf8')

for (let ii = 0; ii < files.length; ii++) {
  const params = {
    Bucket: srcBucket,
    Key: `${files[ii]}`,
  };
  console.log('Get:', params.Key, 'from:', params.Bucket);
  var resp = await s3.getObject(params).promise().catch(err => {
    console.log(err, err.stack)
    return 'Failed to get object'
  })
  
  gzip.write(resp.Body);
}

gzip.flush()
gzip.end()

var destPath = files[0].replace(srcPrefix, destPrefix).replace('.txt','.gz')

var msg = 'merging ' + srcBucket + ':' + srcPrefix + '* to ' + srcBucket + ':' + destPath

console.log('Attempting: ' + msg);
const data = await s3.putObject({
  Bucket: srcBucket,
  Key: destPath,
  Body: await streamToBuffer(gzip)
}).promise().catch(err => {
  console.log('Error: ' + msg)
  console.log(err, err.stack)
  return -1
})

if (data === -1) {
  return 'Error while putting new object to S3'
}
  
console.log('Success: ' + msg);
console.log(data);

The code puts a file on S3, but it is not a correct gzip file: I'm not able to open it. I know the code is not really nice, but I think it should work. Thanks for your help.

EDIT: I forgot to mention that the log files are text files containing JSON entries. They are not already gzipped.

UPDATE: I tried the same with s3.upload instead of s3.putObject, passing the gzip stream directly as the body, since upload is supposed to support streams. It leads to this error:

"The \"list[0]\" argument must be an instance of Buffer or Uint8Array. Received type string ('\\u001f\ufffd\\b\\u0000\\u0000\\u0...)"

Changing gzip.setEncoding('utf8') to gzip.setEncoding(null), as described in https://github.com/aws/aws-sdk-js/issues/2081, did not help.
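For reference, this is roughly what the s3.upload attempt looked like (a sketch, not the exact code, using the same variables as above). My suspicion is that setEncoding('utf8') makes the gzip stream emit strings instead of Buffer chunks, which is what the TypeError above complains about:

const gzip = zlib.createGzip()
gzip.setEncoding('utf8') // suspected culprit: the stream now emits strings, not Buffers

// s3.upload, unlike s3.putObject, accepts a stream of unknown length as Body
const upload = s3.upload({
  Bucket: srcBucket,
  Key: destPath,
  Body: gzip,
}).promise()

for (let ii = 0; ii < files.length; ii++) {
  const resp = await s3.getObject({ Bucket: srcBucket, Key: files[ii] }).promise()
  gzip.write(resp.Body)
}
gzip.end()

await upload // fails with the TypeError quoted above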


Solution

  • I finally got it working! The encoding must not be set on the gzip stream itself, but passed during the write. This is my code, which creates correct gzip files:

    const aws = require('aws-sdk')
    const zlib = require('zlib')
    const stream = require('stream')
    const s3 = new aws.S3()
    
    function streamToBuffer(readableStream) {
      const chunks = []
      return new Promise((resolve, reject) => {
        readableStream.on('data', (chunk) => chunks.push(Buffer.from(chunk)))
        readableStream.on('error', (err) => reject(err))
        readableStream.on('end', () => resolve(Buffer.concat(chunks)))
      })
    }
    
    const gzip = zlib.createGzip()
    
    for (let ii = 0; ii < files.length; ii++) {
      const params = {
        Bucket: srcBucket,
        Key: `${files[ii]}`,
      };
      console.log('Get:', params.Key, 'from:', params.Bucket);
      var resp = await s3.getObject(params).promise().catch(err => {
        console.log(err, err.stack)
        return 'Failed to get object'
      })
      
      // Add the encoding to create correct gzip files!
      gzip.write(resp.Body, 'utf-8');
    }
    
    gzip.flush()
    gzip.end()
    
    var destPath = files[0].replace(srcPrefix, destPrefix).replace('.txt','.gz')
    
    var msg = 'merging ' + srcBucket + ':' + srcPrefix + '* to ' + srcBucket + ':' + destPath
    
    console.log('Attempting: ' + msg);
    const data = await s3.putObject({
      Bucket: srcBucket,
      Key: destPath,
      ContentType: "application/json",
      ContentEncoding: "gzip",
      Body: await streamToBuffer(gzip)
    }).promise().catch(err => {
      console.log('Error: ' + msg)
      console.log(err, err.stack)
      return -1
    })
    
    if (data === -1) {
      return 'Error while putting new object to S3'
    }
      
    console.log('Success: ' + msg);
    console.log(data);
    

    The code is still not nice and I'm grateful for suggestions.
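    One direction such a suggestion could take, sketched here untested and with the same variable names as above (mergeToGzip is a hypothetical helper, not part of the original code): since s3.upload accepts a readable stream of unknown length, each source object could be streamed straight into the gzip stream via createReadStream, so neither the individual files nor the finished gzip output would ever be fully buffered in memory:

    const aws = require('aws-sdk')
    const zlib = require('zlib')

    const s3 = new aws.S3()

    // Hypothetical helper (a sketch, not the original code)
    async function mergeToGzip(files, srcBucket, destPath) {
      const gzip = zlib.createGzip()

      // Start the upload first; s3.upload consumes the gzip stream as it fills.
      const upload = s3.upload({
        Bucket: srcBucket,
        Key: destPath,
        ContentType: 'application/json',
        ContentEncoding: 'gzip',
        Body: gzip,
      }).promise()

      for (const key of files) {
        // Pipe each object into gzip; end: false keeps gzip open for the next file.
        await new Promise((resolve, reject) => {
          const readStream = s3.getObject({ Bucket: srcBucket, Key: key }).createReadStream()
          readStream.on('error', reject)
          readStream.on('end', resolve)
          readStream.pipe(gzip, { end: false })
        })
      }

      gzip.end()
      return upload
    }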