Search code examples
sqlsql-servert-sqlsubquerydynamic-sql

Dynamic SQL :: Calculate percentage of NULLs per index


I have a query that help me list all index in a database and works pretty well:

SELECT TableName = t.name, 
       IndexName = ind.name, 
       IndexId = ind.index_id, 
       ColumnId = ic.index_column_id, 
       ColumnName = col.name,
       --(SELECT SUM(CASE WHEN col.name IS NULL THEN 1 ELSE 0 END) * 100.0 / count(*) FROM t.name) as nulls_percent,
       ind.*, 
       ic.*, 
       col.*
FROM sys.indexes ind
     INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id
                                        AND ind.index_id = ic.index_id
     INNER JOIN sys.columns col ON ic.object_id = col.object_id
                                   AND ic.column_id = col.column_id
     INNER JOIN sys.tables t ON ind.object_id = t.object_id
WHERE ind.is_primary_key = 0
      AND ind.is_unique = 0
      AND ind.is_unique_constraint = 0
      AND t.is_ms_shipped = 0
ORDER BY t.name, 
         ind.name, 
         ind.index_id, 
         ic.is_included_column, 
         ic.key_ordinal;

Unfortunately if I uncomment the 6th line the word t.name is underlined in red and if I run the query I receive the error:

Invalid object name 'TableName'.

How can I make this subquery work?

The goal is to have a percentage of NULLs on each column


Solution

  • Your question and query are quite interesting and can be resolved with the help of statistics which always exist for columns participating in a index as is the case of the columns your are selecting in your query.

    For accurate results, it is beneficial to update the statistics before running the query I'm providing below.

    ;WITH cteColumnAllStats AS
    (
        SELECT
            ST_COL.object_id,
            ST_COL.column_id,
            ST_COL.stats_id,
    
            -- NOTE: order no among stats of the same column
            ROW_NUMBER()
                OVER(
                    PARTITION BY
                        ST_COL.object_id,
                        ST_COL.column_id
                    ORDER BY
                        ST_COL.stats_id
                ) AS StatsOrderNo
        FROM sys.stats ST
        INNER JOIN sys.stats_columns ST_COL
            ON  ST_COL.stats_id = ST.stats_id
            AND ST_COL.object_id = ST.object_id
    )
    ,cteColumnFirstStats AS
    (
        SELECT
            ST_COL.object_id,
            ST_COL.column_id,
    
            -- NOTES:
            -- =====
            -- this would be null if there were no statistics for the column
            -- however not in this case because we are only considering columns
            -- participating in an index and all indices have statistics behind
            -- the scenes.
            --
            -- Also consider whether the statistics have been updated:
            -- If they have, the result will be a whole number (without decimals)
            -- and the result is exact.
            -- If they have not, the result is an estimate and in most of the cases
            -- there will be decimals or even produce a negative result.
            --
            -- If you want accurate results, you need to update the statistics:
            -- EXEC sp_updatestats
            --
            SUM(ST_HIST.range_rows) + SUM(ST_HIST.equal_rows) AS NonNullsRowCount
        FROM cteColumnAllStats ST_COL
    
        -- NOTE: this is the important bit
        CROSS APPLY sys.dm_db_stats_histogram(
            ST_COL.object_id,
            ST_COL.stats_id
        ) ST_HIST
    
        WHERE   ST_COL.StatsOrderNo = 1 -- take only the first stats for the column
        GROUP BY
            ST_COL.object_id,
            ST_COL.column_id
    )
        SELECT TableName = t.name, 
               IndexName = ind.name, 
               IndexId = ind.index_id, 
               ColumnId = ic.index_column_id, 
               ColumnName = col.name,
    
               -- NOTE: included these columns for reference purposes (PLEASE REMOVE)
               SIND.rowcnt AS [RowCount],
               ST_COL.NonNullsRowCount,
               SIND.rowcnt - ST_COL.NonNullsRowCount AS NullsRowCount,
    
               --(SELECT SUM(CASE WHEN col.name IS NULL THEN 1 ELSE 0 END) * 100.0 / count(*) FROM t.name) as nulls_percent,
               CASE
                    -- NOTE: stats are definitely out of date
                    WHEN SIND.rowcnt < ST_COL.NonNullsRowCount THEN NULL
    
                    -- NOTE: stats could be out of date (good to update them first)
                    -- Also we don't want a divide by 0 hence the NULLIF
                    ELSE (SIND.rowcnt - ST_COL.NonNullsRowCount) * 100.0 / NULLIF(SIND.rowcnt, 0)
               END as nulls_percent,
    
               ind.*, 
               ic.*, 
               col.*
        FROM sys.indexes ind
             INNER JOIN sys.index_columns ic ON ind.object_id = ic.object_id
                                                AND ind.index_id = ic.index_id
             INNER JOIN sys.columns col ON ic.object_id = col.object_id
                                           AND ic.column_id = col.column_id
             INNER JOIN sys.tables t ON ind.object_id = t.object_id
    
        -- NOTE: this gives you the COUNT(*) without querying the table
        INNER JOIN sys.sysindexes SIND
            ON  SIND.id = t.object_id
    
            -- NOTE:
            -- 0 means Heap
            -- 1 means Clustered Index
            -- Only these are reliable to use their rowcnt.
            -- There's always 1 of these and not the other.
            AND SIND.indid < 2
    
        -- NOTE: inner join is OK here because all columns participating in a index
        -- have associated statistics
        INNER JOIN cteColumnFirstStats ST_COL
            ON  ST_COL.object_id = t.object_id
            AND ST_COL.column_id = col.column_id
    
        WHERE ind.is_primary_key = 0
              AND ind.is_unique = 0
              AND ind.is_unique_constraint = 0
              AND t.is_ms_shipped = 0
        ORDER BY t.name, 
                 ind.name, 
                 ind.index_id, 
                 ic.is_included_column, 
                 ic.key_ordinal;