azure, azure-data-lake, azure-data-lake-gen2

How to calculate data size in Azure Data Lake within a timeframe: accessing files in that range based on modified time in ADLS


I have data nested in folders and subfolders within Azure Data Lake. Each file has a name, and in ADLS we can see its modified time. I want to calculate the total storage size of the data stored within a particular timeframe. How can I access the files within that specific time range and calculate their total size?


Solution

  • The code below should work after you provide the start and end times of the range you want to size (it uses dbutils, so it assumes a Databricks environment) -

    from datetime import datetime
    from pyspark.sql import SparkSession
    
    # Set up Spark session
    spark = SparkSession.builder.appName("FileCount").getOrCreate()
    
    # ADLS Gen2 storage account details
    account_name = "<account-name>"
    container_name = "<container_name>"
    # Note: if dbutils.fs.ls does not expand the wildcard in your environment,
    # point relative_path at a concrete directory instead.
    relative_path = "ADL_STG_NEW/attrep_change*"
    
    # Define the start and end timestamps
    start_timestamp = datetime.strptime("2023-11-16 00:00:00", "%Y-%m-%d %H:%M:%S")
    end_timestamp = datetime.strptime("2023-11-17 00:00:00", "%Y-%m-%d %H:%M:%S")
    
    # Convert timestamps to milliseconds for comparison
    start_timestamp_ms = int(start_timestamp.timestamp()) * 1000
    end_timestamp_ms = int(end_timestamp.timestamp()) * 1000
    
    print(start_timestamp_ms)
    print(end_timestamp_ms)
    
    # ADLS Gen2 path
    adls_base_path = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/{relative_path}"
    
    
    def get_dir_content(ls_path):
        """Yield the paths of the sub-directories directly under ls_path."""
        for dir_path in dbutils.fs.ls(ls_path):
            if dir_path.isDir() and ls_path != dir_path.path:
                yield dir_path.path
    
    
    # Walk each sub-directory, keep only the files whose modification time falls
    # inside the range, and accumulate the overall file count and size.
    total_file_count = 0
    total_size_bytes = 0
    
    for adls_path in get_dir_content(adls_base_path):
        file_list = dbutils.fs.ls(adls_path)
        filtered_files = [
            (file.name, file.modificationTime, file.size) for file in file_list
            if start_timestamp_ms <= file.modificationTime <= end_timestamp_ms
        ]
        file_count = len(filtered_files)
        dir_size_bytes = sum(file_info[2] for file_info in filtered_files)
        total_file_count += file_count
        total_size_bytes += dir_size_bytes
        print(f"{adls_path},{dir_size_bytes / (1024 * 1024)},{file_count}")
    
    total_size_mb = total_size_bytes / (1024 * 1024)
    print(f"Number of files modified between {start_timestamp} and {end_timestamp}: {total_file_count}")
    print(f"Total size of files: {total_size_mb} MegaBytes")