Search code examples
python, amazon-web-services, amazon-s3, boto

How to count files inside zip in AWS S3 without downloading it?


Case: There is a large zip file in an S3 bucket which contains a large number of images. Is there a way without downloading the whole file to read the metadata or something to know how many files are inside the zip file?

When the file is local, in Python I can just open it with zipfile() and then call the namelist() method, which returns a list of all the files inside, and I can count that. However, I am not sure how to do this when the file resides in S3 without having to download it. It would be best if this were also possible with Lambda.


Solution

  • I think this will solve your problem:

    import zlib
    import zipfile
    import io
    
    def fetch(key_name, start, len, client_s3, bucket=None):
        """
        Range-fetch bytes of an S3 object without downloading all of it.

        key_name:  key (path) of the object inside the bucket
        start:     first byte of the range (0-based, inclusive)
        len:       number of bytes to fetch (parameter name shadows the
                   builtin `len`; kept unchanged for backward compatibility)
        client_s3: an object created using boto3.client("s3")
        bucket:    bucket name; when None, falls back to the module-level
                   global `bucket_name` (the original, fragile behavior —
                   prefer passing it explicitly)

        Returns the fetched bytes.
        """
        if bucket is None:
            # Legacy behavior: the original code silently read the global
            # `bucket_name` set in the __main__ guard, which raises a
            # NameError when this module is imported as a library.
            bucket = bucket_name
        # S3/HTTP byte ranges are inclusive on both ends.
        end = start + len - 1
        s3_object = client_s3.get_object(
            Bucket=bucket, Key=key_name, Range="bytes=%d-%d" % (start, end)
        )
        return s3_object['Body'].read()
    
    
    def parse_int(bytes):
        """
        Parse a little-endian byte sequence into an unsigned integer.

        Generalized from the original hand-rolled 2-or-4-byte logic: any
        length is now accepted. (The original also silently ignored the
        third byte of a 3-byte input — a truncation bug.)

        bytes: a bytes-like object (parameter name shadows the builtin
               `bytes`; kept unchanged for backward compatibility)

        Returns the decoded non-negative integer.
        """
        return int.from_bytes(bytes, "little")
    
    
    def list_files_in_s3_zipped_object(bucket_name, key_name, client_s3):
        """
        List files in an S3-hosted zip without downloading the whole object.

        Only the End Of Central Directory (EOCD) record and the central
        directory are range-fetched, so the transfer is tiny even for a
        huge archive. Prints each entry and returns the number of files
        inside the zip file.

        See : https://stackoverflow.com/questions/41789176/how-to-count-files-inside-zip-in-aws-s3-without-downloading-it
        Based on : https://stackoverflow.com/questions/51351000/read-zip-files-from-s3-without-downloading-the-entire-file

        bucket_name: name of the bucket
        key_name:    path to zipfile inside bucket
        client_s3:   an object created using boto3.client("s3")

        NOTE(review): assumes the archive has no trailing zip comment (so
        the EOCD is exactly the last 22 bytes) and is not ZIP64 — confirm
        before using on very large (>4 GiB / >65535-entry) archives.
        """

        def _fetch(start, length):
            # Range-fetch `length` bytes starting at `start`. Inlined here
            # (instead of calling the module-level fetch()) so this function
            # no longer depends on a global `bucket_name` being defined.
            end = start + length - 1  # S3 byte ranges are inclusive
            obj = client_s3.get_object(
                Bucket=bucket_name,
                Key=key_name,
                Range="bytes=%d-%d" % (start, end),
            )
            return obj['Body'].read()

        size = client_s3.head_object(Bucket=bucket_name, Key=key_name)['ContentLength']

        # The EOCD record is the last 22 bytes when there is no zip comment.
        eocd = _fetch(size - 22, 22)

        # Offsets per the zip spec (little-endian uint32 fields):
        # central-directory size at bytes 12..16, its start offset at 16..20.
        cd_start = int.from_bytes(eocd[16:20], "little")
        cd_size = int.from_bytes(eocd[12:16], "little")

        # Fetch the central directory, append the EOCD, and let zipfile
        # parse the pair as if it were a complete (data-less) archive.
        cd = _fetch(cd_start, cd_size)
        zf = zipfile.ZipFile(io.BytesIO(cd + eocd))

        print("there are %s files in the zipfile" % len(zf.filelist))
        for entry in zf.filelist:
            print("filename: %s (%s bytes uncompressed)" % (entry.filename, entry.file_size))
        return len(zf.filelist)
    
    if __name__ == "__main__":
        import boto3
        import sys

        # Fail with a usage message instead of a raw IndexError when the
        # command-line arguments are missing.
        if len(sys.argv) != 3:
            sys.exit("usage: %s <bucket_name> <key_name>" % sys.argv[0])

        client_s3 = boto3.client("s3")
        # NOTE: these module-level globals are also read by fetch() above
        # when no explicit bucket is passed — keep the assignments.
        bucket_name = sys.argv[1]
        key_name = sys.argv[2]
        list_files_in_s3_zipped_object(bucket_name, key_name, client_s3)