Search code examples
google-bigquery

how to list ALL table sizes in a project


Is there a way to list all the table size in BigQuery?

I know a command like this:

select 
  table_id,
  sum(size_bytes)/pow(10,9) as size
from
  certain_dataset.__TABLES__
group by 
  1

But I want to know all the tables in ALL datasets.

Thanks


Solution

  • This problem got a little easier with the introduction of dynamic SQL to BigQuery scripting in 2020. Now, we can build up a query dynamically and execute it via EXECUTE IMMEDIATE.

    Somthing like this would do for most circumstances where all the datasets are in region-us:

    DECLARE dataset_names ARRAY<STRING>;
    
    SET dataset_names = (
        SELECT ARRAY_AGG(SCHEMA_NAME) FROM `region-us.INFORMATION_SCHEMA.SCHEMATA`
    );
    
    EXECUTE IMMEDIATE (
        SELECT STRING_AGG(
            (SELECT """
                SELECT project_id, dataset_id, table_id, row_count, size_bytes 
                FROM `""" || s || 
                """.__TABLES__`"""), 
                " UNION ALL ")
        FROM UNNEST(dataset_names) AS s);
    

    If there are a large number of datasets then this may return a rate limit error when trying to read all the meta data concurrently.

    If this happens then we can then fall back on a "batched" approach that's a little more complicated to read and slower/less-efficient but still gets the job done:

    DECLARE dataset_names ARRAY<STRING>;
    DECLARE batch ARRAY<STRING>;
    DECLARE batch_size INT64 DEFAULT 25;
    
    CREATE TEMP TABLE results (
        project_id STRING,
        dataset_id STRING,
        table_id STRING,
        row_count INT64,
        size_bytes INT64
    );
    
    SET dataset_names = (
            SELECT ARRAY_AGG(SCHEMA_NAME) 
            FROM `region-us.INFORMATION_SCHEMA.SCHEMATA`
        );
    
    LOOP
        IF ARRAY_LENGTH(dataset_names) < 1 THEN 
            LEAVE;
        END IF;
    
        SET batch = (
            SELECT ARRAY_AGG(d) 
            FROM UNNEST(dataset_names) AS d WITH OFFSET i 
            WHERE i < batch_size);
    
        EXECUTE IMMEDIATE (
            SELECT """INSERT INTO results """ 
                || STRING_AGG(
                        (SELECT """
                            SELECT project_id, dataset_id, table_id, row_count, size_bytes 
                            FROM `""" || s || """.__TABLES__`"""), 
                    " UNION ALL ")
            FROM UNNEST(batch) AS s);
    
        SET dataset_names = (
            SELECT ARRAY_AGG(d) 
            FROM UNNEST(dataset_names) AS d
            WHERE d NOT IN (SELECT * FROM UNNEST(batch)));
            
    END LOOP; 
    
    SELECT * FROM results;