Search code examples
sqloracle-databasehadoophiveimpala

Impala/Hive to get list of tables along with its size


I have used a query in Oracle DB to produce the list of tables in a database along with its owner and respective table size. Here is the sample query i have shared.

select owner, table_name, round((num_rows*avg_row_len)/(1024*1024)) MB 
from all_tables 
where owner not like 'SYS%'  -- Exclude system tables.
and num_rows > 0  -- Ignore empty Tables.
order by MB desc -- Biggest first.

I want the similar output from Impala/Hive.

Note: I tried show table stats <table_name> which will show the stats for individual table. But i want to get all the table stats in one go. Can someone help me in this.


Solution

  • Hive

    CLI

    show table extended like '.*'
    

    tableName:t100k
    owner:cloudera
    location:file:/home/cloudera/local/t100k
    inputformat:org.apache.hadoop.mapred.TextInputFormat
    outputformat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
    columns:struct columns { i32 i}
    partitioned:false
    partitionColumns:
    totalNumberFiles:1
    totalFileSize:588895
    maxFileSize:588895
    minFileSize:588895
    lastAccessTime:0
    lastUpdateTime:1492675975000
    
    tableName:t10k
    owner:cloudera
    location:file:/home/cloudera/local/t10k
    inputformat:org.apache.hadoop.mapred.TextInputFormat
    outputformat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
    columns:struct columns { i32 i}
    partitioned:false
    partitionColumns:
    totalNumberFiles:1
    totalFileSize:48894
    maxFileSize:48894
    minFileSize:48894
    lastAccessTime:0
    lastUpdateTime:1492675978000
    
    tableName:t1k
    owner:cloudera
    location:file:/home/cloudera/local/t1k
    inputformat:org.apache.hadoop.mapred.TextInputFormat
    outputformat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
    columns:struct columns { i32 i}
    partitioned:false
    partitionColumns:
    totalNumberFiles:1
    totalFileSize:3893
    maxFileSize:3893
    minFileSize:3893
    lastAccessTime:0
    lastUpdateTime:1492675983000
    
    tableName:t1m
    owner:cloudera
    location:file:/home/cloudera/local/t1m
    inputformat:org.apache.hadoop.mapred.TextInputFormat
    outputformat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
    columns:struct columns { i32 i}
    partitioned:false
    partitionColumns:
    totalNumberFiles:1
    totalFileSize:6888896
    maxFileSize:6888896
    minFileSize:6888896
    lastAccessTime:0
    lastUpdateTime:1492675968000
    

    Metastore (e.g. MySql)

    select    d.name                                                                                         as db_name
             ,t.tbl_name                                                                                     as tbl_name
             ,from_unixtime(min(t.create_time))                                                              as create_time
             ,min(t.owner)                                                                                   as owner
             ,min(case when tp.param_key = 'COLUMN_STATS_ACCURATE'  then tp.param_value                 end) as COLUMN_STATS_ACCURATE
             ,min(case when tp.param_key = 'last_modified_by'       then tp.param_value                 end) as last_modified_by
             ,min(case when tp.param_key = 'last_modified_time'     then from_unixtime(tp.param_value)  end) as last_modified_time  
             ,min(case when tp.param_key = 'numFiles'               then tp.param_value                 end) as numFiles
             ,min(case when tp.param_key = 'numRows'                then tp.param_value                 end) as numRows
             ,min(case when tp.param_key = 'rawDataSize'            then tp.param_value                 end) as rawDataSize
             ,min(case when tp.param_key = 'totalSize'              then tp.param_value                 end) as totalSize
             ,min(case when tp.param_key = 'transient_lastDdlTime'  then from_unixtime(tp.param_value)  end) as transient_lastDdlTime
             
    from            metastore.DBS           as d
              join  metastore.TBLS          as t
              on    t.db_id = d.db_id
              join  metastore.TABLE_PARAMS  as tp
              on    tp.tbl_id = t.tbl_id
              
    group by  d.name
             ,t.tbl_name
             
    order by  d.name
             ,t.tbl_name         
             
    

    +---------+----------+---------------------+----------+-----------------------+------------------+---------------------+----------+---------+-------------+-----------+-----------------------+
    | db_name | tbl_name |     create_time     |  owner   | COLUMN_STATS_ACCURATE | last_modified_by | last_modified_time  | numFiles | numRows | rawDataSize | totalSize | transient_lastDdlTime |
    +---------+----------+---------------------+----------+-----------------------+------------------+---------------------+----------+---------+-------------+-----------+-----------------------+
    | local   | t100k    | 2017-04-19 23:25:20 | cloudera | true                  | cloudera         | 2017-04-19 23:27:28 |        1 |  100000 |      488895 |    588895 | 2017-04-20 01:12:55   |
    | local   | t10k     | 2017-04-19 23:25:26 | cloudera | true                  | cloudera         | 2017-04-19 23:27:26 |        1 |   10000 |       38894 |     48894 | 2017-04-20 01:12:58   |
    | local   | t1k      | 2017-04-19 23:25:30 | cloudera | true                  | cloudera         | 2017-04-19 23:27:22 |        1 |    1000 |        2893 |      3893 | 2017-04-20 01:13:03   |
    | local   | t1m      | 2017-04-19 23:20:59 | cloudera | true                  | cloudera         | 2017-04-19 23:27:30 |        1 | 1000000 |     5888896 |   6888896 | 2017-04-20 01:12:48   |
    +---------+----------+---------------------+----------+-----------------------+------------------+---------------------+----------+---------+-------------+-----------+-----------------------+