Search code examples
python, huggingface, huggingface-datasets

How can I see the size of a HuggingFace dataset before downloading it?


I want to download a HuggingFace dataset, e.g. uonlp/CulturaX:

from datasets import load_dataset
# Loads the "en" configuration of uonlp/CulturaX; this is the download whose size we want to know in advance.
ds = load_dataset("uonlp/CulturaX", "en")

How can I see the size of a HuggingFace dataset before downloading it?


Solution

  • One can use HfApi.dataset_info from huggingface_hub (with files_metadata=True so that per-file sizes are returned):

    from huggingface_hub import HfApi
    
    def print_dataset_file_sizes(repo_id):
        """Print the size of each file in a Hub dataset repo, then the total.

        The repo metadata is requested with ``files_metadata=True`` so that
        the sibling entries include a ``size``. A missing/falsy size is
        counted as 0 bytes. All sizes are printed in MiB (bytes / 1024**2)
        with two decimal places.

        Args:
            repo_id: Dataset repository id, e.g. ``"uonlp/CulturaX"``.
        """
        bytes_per_mib = 1024 * 1024
        info = HfApi().dataset_info(repo_id=repo_id, files_metadata=True)

        print(f"File sizes for dataset '{repo_id}':\n")
        running_total = 0
        for entry in info.siblings:
            name = entry.rfilename
            # `size` may be None for an entry; treat that as zero bytes.
            n_bytes = entry.size or 0
            running_total += n_bytes
            print(f"  {name}: {n_bytes / bytes_per_mib:.2f} MiB")

        print(f"\nTotal size: {running_total / bytes_per_mib:.2f} MiB")
    
    # Example: list per-file sizes and the total for the CulturaX dataset repo.
    print_dataset_file_sizes('uonlp/CulturaX')
    

    Outputs: Total size: 16658008.77 MiB, i.e. ~15.9 TiB.

    Note that the same code can be used to see the size of a HuggingFace model before downloading it: just replace HfApi.dataset_info with HfApi.model_info, e.g.:

    from huggingface_hub import HfApi
    
    def print_dataset_file_sizes(repo_id):
        """Print the size of each file in a Hub *model* repo, then the total.

        Despite its name (kept so the existing call keeps working), this
        variant queries ``HfApi.model_info`` rather than ``dataset_info``,
        so ``repo_id`` must identify a model repository.

        The metadata is requested with ``files_metadata=True`` so that the
        sibling entries include a ``size``. A missing/falsy size is counted
        as 0 bytes. Sizes are printed in MiB (bytes / 1024**2).

        Args:
            repo_id: Model repository id, e.g. ``"deepseek-ai/DeepSeek-V3"``.
        """
        api = HfApi()
        model_info = api.model_info(repo_id=repo_id, files_metadata=True)

        total_size_bytes = 0
        # The header says "model": this function inspects a model repo.
        print(f"File sizes for model '{repo_id}':\n")
        for sibling in model_info.siblings:
            filename = sibling.rfilename
            # `size` may be None for an entry; treat that as zero bytes.
            size_in_bytes = sibling.size or 0
            total_size_bytes += size_in_bytes
            size_mb = size_in_bytes / (1024 * 1024)
            print(f"  {filename}: {size_mb:.2f} MiB")

        total_size_mb = total_size_bytes / (1024 * 1024)
        print(f"\nTotal size: {total_size_mb:.2f} MiB")
    
    # Example: list per-file sizes and the total for the DeepSeek-V3 model repo.
    print_dataset_file_sizes('deepseek-ai/DeepSeek-V3')
    

    Outputs: Total size: 656703.88 MiB (i.e., ~641.3 GiB).