I have around 19 GB of data, which I first tar and then encrypt. I use the code below to do the job.
from subprocess import call
from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes
import sys
# Archive ./src into a gzip-compressed tarball, keeping ACLs and SELinux
# contexts (-P keeps absolute path names as given).
cmd = ["tar","--acls","--selinux","-czPf","./out.tar.gz","./src"]
proc = call(cmd)

# The whole archive is read into memory and encrypted in a single call.
# The context manager makes sure the input handle is closed afterwards.
with open("./out.tar.gz", "rb") as archive:
    data = archive.read()

# Fresh random 32-byte (AES-256) key. It only exists in this process;
# nothing in this script stores it anywhere.
key = get_random_bytes(32)
cipher = AES.new(key, AES.MODE_GCM)
ciphertext, tag = cipher.encrypt_and_digest(data)

# Output layout: nonce, then authentication tag, then ciphertext.
# A plain loop replaces the side-effect-only list comprehension, and the
# context manager guarantees the output is flushed and closed.
with open("./out.bin", "wb") as out:
    for part in (cipher.nonce, tag, ciphertext):
        out.write(part)
I am using HP Gen10 hardware with 48 CPU cores, 128 GB of memory and 1800.3 GB of HDD space. Only one core is being utilized, at almost 100%, and memory usage is around 43%. The overall process takes more than a day. I am looking for ways to improve the performance of the above code.
I have made significant improvements to the code following SquareRootOfTwentyThree's comments:
from subprocess import call
from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes
import StringIO
# Fresh random 32-byte (AES-256) key, shared by the encrypt and decrypt
# steps below. It is never written anywhere in the code shown.
key = get_random_bytes(32)
def readLargeFile(filename, chunk_size=1024):
    """Yield the contents of *filename* as successive binary chunks.

    The file is opened in binary mode and read ``chunk_size`` bytes at a
    time, so it never has to be held in memory as a whole.

    Args:
        filename: Path of the file to read.
        chunk_size: Maximum number of bytes per yielded chunk; the last
            chunk may be shorter. Defaults to 1024.

    Yields:
        Non-empty byte strings, in file order. An empty file yields
        nothing.
    """
    with open(filename, "rb") as f:
        while True:
            data = f.read(chunk_size)
            if not data:
                # read() returns an empty string only at end of file.
                break
            yield data
# tar command that would build ./out.tar.gz from ./src.
# NOTE(review): no call(cmd) appears in this listing before the archive is
# read below -- a line may be missing here; confirm.
cmd = ["tar","--acls","--selinux","-czPf","./out.tar.gz","./src"]
# --- Encryption: feed the archive to AES-GCM chunk by chunk ---
cipher = AES.new(key, AES.MODE_GCM)
ciphertext = []
# NOTE(review): the loop body is not present in this listing; presumably
# each chunk was passed to cipher.encrypt() and appended to `ciphertext`
# -- confirm.
for data in readLargeFile("./out.tar.gz"):
# Output layout: nonce, then authentication tag, then all ciphertext joined.
# NOTE(review): `out` is not closed or flushed in the lines shown before the
# same file is reopened for reading just below.
out = open("./out.bin", "wb")
[out.write(x) for x in (cipher.nonce, cipher.digest(), b"".join(ciphertext))]
# --- Decryption: read back 16 bytes of nonce, 16 bytes of tag, then the rest ---
file_in = open("./out.bin", "rb")
nonce, tag, ciphertext = [file_in.read(x) for x in (16, 16, -1)]
cipher = AES.new(key, AES.MODE_GCM, nonce)
#data = cipher.decrypt_and_verify(ciphertext, tag)
data = []
# NOTE(review): .read(1024) returns one string of at most 1024 bytes, so this
# loop walks the individual characters of the first 1024 bytes only, not
# 1024-byte chunks of the whole ciphertext. If verify() is then called, the
# cipher has seen only part of the ciphertext, which would explain a
# "MAC check failed" error -- confirm.
# NOTE(review): the loop body is not present in this listing.
for buf in StringIO.StringIO(ciphertext).read(1024):
# NOTE(review): the body of this `with` (presumably writing the decrypted
# data) is not present in this listing.
with open("./dst/out.tar.gz", "wb") as f:
# Unpack the decrypted archive into ./dst.
cmd = ["tar","-xzPf","./dst/out.tar.gz","-C","./dst"]
proc = call(cmd)
Encryption succeeds, but during decryption verify() raises ValueError: MAC check failed
Note: I am using PyCryptodome v3.6.6
I eventually managed to get decryption working; below is my latest code:
#! /usr/bin/python
from subprocess import Popen,PIPE,call
from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes
import StringIO,io,tarfile
import os,sys
import datetime
# Timestamped banner so the duration of the encryption phase can be measured.
print "*** Encryption Starts *** " + str(datetime.datetime.now())
# Fresh random 32-byte (AES-256) key, shared by the encrypt and decrypt
# phases below. It is never written anywhere in the code shown.
key = get_random_bytes(32)
def readLargeFile(filename, chunk_size=1024):
    """Yield the contents of *filename* as successive binary chunks.

    The file is opened in binary mode and read ``chunk_size`` bytes at a
    time, so it never has to be held in memory as a whole.

    Args:
        filename: Path of the file to read.
        chunk_size: Maximum number of bytes per yielded chunk; the last
            chunk may be shorter. Defaults to 1024.

    Yields:
        Non-empty byte strings, in file order. An empty file yields
        nothing.
    """
    with open(filename, "rb") as f:
        while True:
            data = f.read(chunk_size)
            if not data:
                # read() returns an empty string only at end of file.
                break
            yield data
# The whole tar command is one shell string, run with shell=True so that the
# shell expands the trailing `*` glob.
cmd = ["tar --acls --selinux -czPf /nfs/out.tar.gz ./encrypt_disk/src/*"]
call(cmd, shell=True)
# --- Encryption: feed the archive to AES-GCM chunk by chunk ---
cipher = AES.new(key, AES.MODE_GCM)
ciphertext = []
# NOTE(review): the loop body is not present in this listing; presumably
# each chunk was passed to cipher.encrypt() and appended to `ciphertext`
# -- confirm.
for data in readLargeFile("/nfs/out.tar.gz"):
# Output layout: nonce, then authentication tag, then all ciphertext joined.
# NOTE(review): `out` is not closed or flushed in the lines shown before the
# same file is reopened for reading below.
out = open("/nfs/out.bin", "wb")
[out.write(x) for x in (cipher.nonce, cipher.digest(), b"".join(ciphertext))]
print "*** Encryption Ends *** " + str(datetime.datetime.now())
print "*** Decryption Starts *** " + str(datetime.datetime.now())
# --- Decryption: read back 16 bytes of nonce, 16 bytes of tag, then the rest ---
file_in = open("/nfs/out.bin", "rb")
nonce, tag, ciphertext = [file_in.read(x) for x in (16, 16, -1)]
cipher = AES.new(key, AES.MODE_GCM, nonce)
# The entire ciphertext is decrypted and verified in memory in one call, and
# the plaintext is wrapped in an in-memory file object for tarfile.
# NOTE(review): no extract call on `tar` appears in this listing.
tar = tarfile.open(fileobj=StringIO.StringIO(cipher.decrypt_and_verify(ciphertext, tag)), mode='r|*')
print "*** Decryption Ends *** " + str(datetime.datetime.now())
GCM is hard (though not impossible) to parallelize. Still, on my 3-year-old x86 laptop (with the AES-NI and CLMUL accelerated instructions) I get 150 MB/s with PyCryptodome's GCM. That is only about 2 minutes for 19 GB, not a day! I used the following toy code:
# 1 MiB of random bytes used as the benchmark payload.
data = os.urandom(1024*1024)
# NOTE(review): `key` is not defined in this snippet; an AES key is assumed
# to exist already.
cipher = AES.new(key, AES.MODE_GCM)
# NOTE(review): the loop body appears to be missing from this listing. `data`
# is otherwise unused, so presumably each of the 1024 iterations called
# cipher.encrypt(data) (1 GiB in total) -- confirm.
for _ in range(1024):
# Finalize the cipher and obtain the authentication tag.
tag = cipher.digest()
The code is not directly usable for your use case, but it indicates that the problem might be that you are encrypting the full 19 GB at once. Perhaps you should instead break the processing up into chunks.
Some other comments: