Search code examples
python amazon-s3 tar tarfile

Creating a tar stream in memory from multiple file byte streams


I'm trying to create a tar stream in memory, add files to it, and then save it to S3. However, the files inside the resulting tar all have zero size. Can anyone please advise? Code snippet below:

def tar_and_upload(bucket, keys, dest_bucket):
    """Bundle the S3 objects named by *keys* into a gzipped tar stored in *dest_bucket*.

    Each object is fetched from *bucket* and added to an in-memory archive
    under its base name (the part of the key after the last "/"). The
    finished archive is uploaded under a random ``<uuid4>.tar.gz`` key.

    Returns None. A failed upload is logged rather than raised; errors while
    fetching objects or building the archive are not caught here.

    NOTE(review): as written, the archive members come out with zero size;
    see the comment on the ``addfile`` call below.
    """
    s3 = boto3.client('s3')
    # In-memory buffer that receives the compressed archive bytes.
    file_obj = io.BytesIO()
    tar_file_obj = tarfile.open(mode = "w:gz", fileobj=file_obj)    
    # NOTE(review): `response` is never read anywhere in this function.
    response = {}
    for key in keys:
        obj = s3.get_object(Bucket=bucket, Key=key)
        # The whole object body is read into memory.
        _bytes = obj["Body"].read()
        # Keep only the last path component of the key as the member name.
        _file_name = key.split("/")[-1]
        # NOTE(review): the TarInfo is built from a name only, so its size is
        # never set, and the payload is passed as raw bytes rather than a
        # file-like object. tarfile copies `tarinfo.size` bytes from the
        # object it is given, so with the size left unset only the header is
        # written -- presumably the cause of the zero-size members.
        tar_file_obj.addfile(tarfile.TarInfo(_file_name), _bytes)
    # Close the archive before the buffer contents are read for upload.
    tar_file_obj.close()
    try:
        obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
        s3.put_object(Body=file_obj.getvalue(), Bucket=dest_bucket, Key=obj_name)
    except Exception as e:
        # Best-effort upload: record the traceback and give up quietly.
        logging.error("Can't save tar to S3", exc_info=True)
        return

Solution

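  • When adding a byte stream to a tar archive, the member's size must be set explicitly: `TarInfo.size` defaults to 0, and `addfile()` copies exactly that many bytes from the file-like object it is given. Set `tar_info.size = len(data)` and wrap the bytes in `io.BytesIO` before passing them to `addfile()`. Sample code: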

    import tarfile
    import uuid
    import io
    import os
    
    def tar_and_upload(src_dir="images", dest_dir="temp"):
        """Pack the regular files of *src_dir* into an in-memory ``.tar.gz`` and save it.

        Every regular file directly inside *src_dir* is read into memory and
        added to the archive under its bare file name. Sub-directories and
        other non-file entries are skipped. The finished archive is written
        to ``<dest_dir>/<uuid4>.tar.gz``; *dest_dir* is created if missing.

        Args:
            src_dir: Directory whose files are archived (default ``"images"``).
            dest_dir: Directory that receives the archive (default ``"temp"``).

        Returns:
            None. The name of each packed file and of the archive is printed.
            A failure while saving the archive is printed instead of raised;
            errors while reading the source files propagate to the caller.
        """
        file_obj = io.BytesIO()
        # `with` closes the archive on success, writing the end-of-archive
        # blocks and the gzip trailer before the buffer is read below.
        with tarfile.open(mode="w:gz", fileobj=file_obj) as tar_file_obj:
            # Sorted so that the member order does not depend on the file system.
            for filename in sorted(os.listdir(src_dir)):
                file_path = os.path.join(src_dir, filename)
                if not os.path.isfile(file_path):
                    # open() would fail on a directory; only plain files are packed.
                    continue
                print(filename)
                with open(file_path, "rb") as f:
                    _bytes = f.read()
                tar_info = tarfile.TarInfo(filename)
                # The size must be set explicitly: addfile() copies exactly
                # tar_info.size bytes from the stream, and the default is 0.
                tar_info.size = len(_bytes)
                tar_file_obj.addfile(tar_info, io.BytesIO(_bytes))
        try:
            os.makedirs(dest_dir, exist_ok=True)
            obj_name = "{}.tar.gz".format(uuid.uuid4())
            object_path = os.path.join(dest_dir, obj_name)
            with open(object_path, "wb") as f:
                f.write(file_obj.getvalue())
            print(obj_name)
        except OSError as e:
            # Saving is best-effort: report the I/O problem and carry on.
            print(str(e))
    
    # Script entry point: build the archive only when this file is run
    # directly, not when it is imported as a module.
    if __name__ == "__main__":
        tar_and_upload()