Over the last few months I've been working on an export project. Basically, the application needs to get files from blob storage, merge the files into a single file, compress it into a zip, and upload it to blob storage. I split the process into steps. The performance is very good and the entire process works, but when I export a lot of files, the last step crashes (because my environment only has 15 GB of memory and the file is bigger than that). Any ideas?
A short description of the final step, with the code:
/// <summary>
/// Downloads every blob listed in the container into memory and returns the contents keyed by blob name.
/// </summary>
/// <param name="exportId">Identifier of the export; forwarded unchanged to <see cref="DownloadAndEnlist"/>.</param>
/// <returns>A dictionary mapping each blob name to that blob's full content.</returns>
/// <remarks>
/// All downloads are started before any is awaited, and every blob's bytes are kept in the
/// returned dictionary, so peak memory grows with the total size of the listed blobs.
/// </remarks>
public async Task<Dictionary<string, byte[]>> DownloadManyAsync(Guid exportId)
{
    var container = _blobServiceClient.GetBlobContainerClient("");
    var options = BlobStorageTools.GetOptions();
    var files = new ConcurrentDictionary<string, byte[]>();
    var tasks = new List<Task>();

    // List asynchronously: the synchronous GetBlobs would block this thread on every
    // page of results it has to fetch from the service.
    await foreach (var blob in container.GetBlobsAsync(prefix: ""))
    {
        tasks.Add(DownloadAndEnlist(container.GetBlobClient(blob.Name), files, options, exportId));
    }

    await Task.WhenAll(tasks);

    // Copy into a plain dictionary, keeping the concurrent dictionary's key comparer.
    return files.ToDictionary(x => x.Key, x => x.Value, files.Comparer);
}
/// <summary>
/// Downloads a single blob completely into memory and stores its bytes in
/// <paramref name="files"/> under the blob's name.
/// </summary>
/// <param name="blob">Client for the blob to download.</param>
/// <param name="files">Shared collection that receives the downloaded content; an existing entry with the same name is left untouched.</param>
/// <param name="options">Transfer options passed to the download call.</param>
/// <param name="exportId">Not used by this method.</param>
public async Task DownloadAndEnlist(BlobClient blob, ConcurrentDictionary<string, byte[]> files, StorageTransferOptions options, Guid exportId)
{
    using (var downloaded = new MemoryStream())
    {
        await blob.DownloadToAsync(downloaded, default, options);

        string blobName = blob.Name;
        byte[] content = downloaded.ToArray();

        // TryAdd's result is deliberately ignored: a duplicate name keeps the first content stored.
        files.TryAdd(blobName, content);
    }
}
// Builds the zip archive in memoryStream from the downloaded files, removing each file
// from the dictionary as soon as it has been written to the archive.
using var memoryStream = new MemoryTributary();
// The third argument (leaveOpen: true) keeps memoryStream open after the archive is disposed.
using (var archive = new ZipArchive(memoryStream, ZipArchiveMode.Create, true))
{
    // Snapshot the keys once so every lookup is O(1); indexing the dictionary with
    // ElementAt(i) would re-enumerate it on each iteration (quadratic in the file count).
    var fileNames = files.Keys.ToArray();

    // Walk backwards to keep the same entry order as enumerating the dictionary from its end.
    for (int i = fileNames.Length - 1; i >= 0; i--)
    {
        var fileName = fileNames[i];
        var content = files[fileName];

        var zipArchiveEntry = archive.CreateEntry(fileName, CompressionLevel.Fastest);
        using (var zipStream = zipArchiveEntry.Open())
        {
            zipStream.Write(content, 0, content.Length);
        }

        // Drop the dictionary's reference so the source buffer is no longer kept alive by it.
        files.Remove(fileName);
    }
}
/// <summary>
/// Uploads <paramref name="file"/> as the block blob "<paramref name="fileName"/>.zip" in blocks of
/// up to 8,000,000 bytes and commits the blocks in the order they were read from the stream.
/// </summary>
/// <param name="fileName">Blob name without the ".zip" extension.</param>
/// <param name="file">Stream to upload. It is rewound to position 0 when it is not already there.</param>
/// <returns>The URI of the uploaded blob, as a string.</returns>
/// <remarks>
/// Block uploads are started as soon as each block has been read and are awaited together,
/// so block buffers for uploads still in flight are all held in memory at the same time.
/// </remarks>
public async Task<string> SaveExport(string fileName, Stream file)
{
    const int BlockSize = 8000000;

    var cloudBlockBlob = _blobClient.GetContainerReference("").GetBlockBlobReference($"{fileName}.zip");

    // Ids are recorded in read order (not upload-completion order), because the order of the
    // committed block list is the order of the bytes in the final blob.
    List<string> blockIds = new();
    List<Task> uploads = new();

    if (file.Position != 0)
    {
        file.Position = 0;
    }

    // Everything an upload needs is passed as an argument, so no upload observes a variable
    // that the read loop changes afterwards.
    async Task UploadBlockAsync(string blockId, byte[] data, int count)
    {
        using var blockData = new MemoryStream(data, 0, count);
        await cloudBlockBlob.PutBlockAsync(blockId, blockData, null);
    }

    int blockNumber = 0;
    while (true)
    {
        byte[] buffer = new byte[BlockSize];
        int filled = 0;
        int read;

        // A single read may return fewer bytes than requested before the end of the stream,
        // so keep reading until the block is full or the stream reports end (0 bytes).
        do
        {
            read = await file.ReadAsync(buffer, filled, BlockSize - filled);
            filled += read;
        } while (read > 0 && filled < BlockSize);

        if (filled == 0)
        {
            // Nothing left to upload; never send an empty block.
            break;
        }

        blockNumber++;
        string blockId = $"{blockNumber:000000000}";
        string base64BlockId = Convert.ToBase64String(Encoding.UTF8.GetBytes(blockId));

        blockIds.Add(base64BlockId);
        uploads.Add(UploadBlockAsync(base64BlockId, buffer, filled));

        if (read == 0)
        {
            // End of stream was reached while filling this (final, partial) block.
            break;
        }
    }

    await Task.WhenAll(uploads);
    await cloudBlockBlob.PutBlockListAsync(blockIds);
    return cloudBlockBlob.Uri.ToString();
}
I thought about using Azure Functions, but Functions have a 15 GB memory limit, so I would have the same issue.
This article helped me a lot. Basically, I used a blob write stream to write my zip directly to storage, so the memory usage was very low. I hope this helps future developers with the same issue.
The new code:
// Streams every source blob through the zip encoder directly into the destination blob's
// write stream, so no file has to be held in memory in full.
using (var zipFileStream = await _shareExportStorage.OpenZipFileStreamAsync(filename))
{
    // IsStreamOwner = false: disposing the zip stream leaves zipFileStream to the outer using.
    using (var zipOutputStream = new ZipOutputStream(zipFileStream) { IsStreamOwner = false })
    {
        zipOutputStream.SetLevel(4);

        // Await the listing instead of reading .Result, which blocks the thread and wraps
        // any failure in an AggregateException.
        foreach (var file in await filesListTask)
        {
            // The entry size is set up front from the blob's reported content length.
            var properties = await _exportFilesStorage.GetBlobProperties(file);
            var zipEntry = new ZipEntry(file)
            {
                Size = properties.ContentLength
            };

            zipOutputStream.PutNextEntry(zipEntry);
            await _exportFilesStorage.DownloadOneToStreamAsync(zipOutputStream, file);
            zipOutputStream.CloseEntry();
        }
    }
}
/// <summary>
/// Opens a writable stream over the block blob named <paramref name="fileName"/>, with its
/// content type set to "application/zip".
/// </summary>
/// <param name="fileName">Name of the blob to write.</param>
/// <returns>A stream whose written bytes go to the blob.</returns>
public async Task<Stream> OpenZipFileStreamAsync(string fileName)
{
    var connectionString = _configuration["AzureBlobStorage:ConnectionString"];
    var zipBlobClient = new BlockBlobClient(connectionString, "", fileName);

    var zipHeaders = new BlobHttpHeaders();
    zipHeaders.ContentType = "application/zip";

    var writeOptions = new BlockBlobOpenWriteOptions();
    writeOptions.HttpHeaders = zipHeaders;

    // First argument is the SDK's overwrite flag.
    var writeStream = await zipBlobClient.OpenWriteAsync(true, options: writeOptions);
    return writeStream;
}
/// <summary>
/// Downloads the blob named <paramref name="blobName"/> and writes its content into
/// <paramref name="destination"/>.
/// </summary>
/// <param name="destination">Stream that receives the blob's bytes.</param>
/// <param name="blobName">Name of the blob to download.</param>
public async Task DownloadOneToStreamAsync(Stream destination, string blobName)
{
    var blobClient = _blobServiceClient
        .GetBlobContainerClient("")
        .GetBlobClient(blobName);

    // Use the shared transfer settings from BlobStorageTools for this download.
    var downloadOptions = new BlobDownloadToOptions();
    downloadOptions.TransferOptions = BlobStorageTools.GetOptions();

    await blobClient.DownloadToAsync(destination, downloadOptions);
}