Since our logging mechanism cannot create large gz-files, I'm trying to do it with a Lambda. It works when I load all of the files from S3 into memory and then create the gzip file, but that needs too much memory. So I'm trying the following instead: open a gzip stream in memory and, whenever I receive the content of a file from S3, write it to that stream. So far without luck. Among other ideas, I tried the code below.
I read here https://github.com/aws/aws-sdk-js/issues/2961 that the aws-sdk needs to know the length of a stream. That's why I use the streamToBuffer function, which is also described in that issue.
const aws = require('aws-sdk')
const zlib = require('zlib')
const stream = require('stream')

const s3 = new aws.S3()

// Collect a readable stream into a single Buffer so the SDK knows the content length.
async function streamToBuffer(readableStream) {
    const chunks = []
    return new Promise((resolve, reject) => {
        readableStream.on('data', (chunk) => chunks.push(Buffer.from(chunk)))
        readableStream.on('error', (err) => reject(err))
        readableStream.on('end', () => resolve(Buffer.concat(chunks)))
    })
}

const gzip = zlib.createGzip()
gzip.setEncoding('utf8')

// files, srcBucket, srcPrefix, destPrefix and currentPrefix are defined elsewhere in the Lambda.
for (let ii = 0; ii < files.length; ii++) {
    const params = {
        Bucket: srcBucket,
        Key: `${files[ii]}`,
    };
    console.log('Get:', params.Key, 'from:', params.Bucket);
    var resp = await s3.getObject(params).promise().catch(err => {
        console.log(err, err.stack)
        return 'Failed to list objects'
    })
    gzip.write(resp.Body);
}
gzip.flush()
gzip.end()

var destPath = files[0].replace(srcPrefix, destPrefix).replace('.txt', '.gz')
var msg = 'merging ' + srcBucket + ':' + currentPrefix + '* to ' + srcBucket + ':' + destPath
console.log('Attempting: ' + msg);

const data = await s3.putObject({
    Bucket: srcBucket,
    Key: destPath,
    Body: await streamToBuffer(gzip)
}).promise().catch(err => {
    console.log('Error: ' + msg)
    console.log(err, err.stack)
    return -1
})
if (data === -1) {
    return 'Error while putting new object to S3'
}
console.log('Success: ' + msg);
console.log(data);
The code puts a file to S3, but it is not a valid gzip file and I'm not able to open it. I know the code is not really nice, but I think it should work. Thanks for your help.
EDIT: I forgot to mention that the log files are text files containing JSON entries. They are not already gzipped.
UPDATE: I tried the same with s3.upload instead of s3.putObject, passing the gzip stream directly as the body, since upload should support streams. It leads to this error:
"The \"list[0]\" argument must be an instance of Buffer or Uint8Array. Received type string ('\\u001f\ufffd\\b\\u0000\\u0000\\u0...)"
Changing gzip.setEncoding('utf8') to gzip.setEncoding(null), as described here https://github.com/aws/aws-sdk-js/issues/2081, did not help.
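Roughly, that upload attempt looked like this (only the put call changed, the rest of the code stayed the same):

// Sketch of the s3.upload attempt; upload accepts a readable stream as Body.
const data = await s3.upload({
    Bucket: srcBucket,
    Key: destPath,
    Body: gzip            // pass the gzip stream directly
}).promise().catch(err => {
    console.log('Error: ' + msg)
    console.log(err, err.stack)
    return -1
})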
I finally got it working! I must not set the encoding on the gzip stream itself, but instead pass it when writing to the stream. This is my code, which creates correct gzip files:
const aws = require('aws-sdk')
const zlib = require('zlib')
const stream = require('stream')

const s3 = new aws.S3()

// Collect a readable stream into a single Buffer so the SDK knows the content length.
async function streamToBuffer(readableStream) {
    const chunks = []
    return new Promise((resolve, reject) => {
        readableStream.on('data', (chunk) => chunks.push(Buffer.from(chunk)))
        readableStream.on('error', (err) => reject(err))
        readableStream.on('end', () => resolve(Buffer.concat(chunks)))
    })
}

// Note: no gzip.setEncoding('utf8') here.
const gzip = zlib.createGzip()

// files, srcBucket, srcPrefix, destPrefix and currentPrefix are defined elsewhere in the Lambda.
for (let ii = 0; ii < files.length; ii++) {
    const params = {
        Bucket: srcBucket,
        Key: `${files[ii]}`,
    };
    console.log('Get:', params.Key, 'from:', params.Bucket);
    var resp = await s3.getObject(params).promise().catch(err => {
        console.log(err, err.stack)
        return 'Failed to list objects'
    })
    // Add the encoding here, in the write call, to create correct gzip files!
    gzip.write(resp.Body, 'utf-8');
}
gzip.flush()
gzip.end()

var destPath = files[0].replace(srcPrefix, destPrefix).replace('.txt', '.gz')
var msg = 'merging ' + srcBucket + ':' + currentPrefix + '* to ' + srcBucket + ':' + destPath
console.log('Attempting: ' + msg);

const data = await s3.putObject({
    Bucket: srcBucket,
    Key: destPath,
    ContentType: "application/json",
    ContentEncoding: "gzip",
    Body: await streamToBuffer(gzip)
}).promise().catch(err => {
    console.log('Error: ' + msg)
    console.log(err, err.stack)
    return -1
})
if (data === -1) {
    return 'Error while putting new object to S3'
}
console.log('Success: ' + msg);
console.log(data);
The code is still not nice and I'm grateful for suggestions.
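One simplification I might still try myself (untested): now that the gzip stream no longer has an output encoding set, it may be possible to skip streamToBuffer entirely and hand the stream to s3.upload, which accepts streaming bodies. Roughly:

// Untested idea: let s3.upload consume the gzip stream directly,
// so the whole archive never has to be buffered in memory.
const data = await s3.upload({
    Bucket: srcBucket,
    Key: destPath,
    ContentType: 'application/json',
    ContentEncoding: 'gzip',
    Body: gzip
}).promise()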