Search code examples
sqldb2

Logically backfilling rows in SQL


I have this table in SQL:

CREATE TABLE myt 
(
    name VARCHAR(50) NOT NULL,
    year INTEGER NOT NULL,
    gender CHAR(1) NOT NULL CHECK (gender IN ('M', 'F')),
    country_of_birth VARCHAR(50) NOT NULL,
    age INTEGER NOT NULL CHECK (age > 0),
    number_of_football_games_attended INTEGER CHECK (number_of_football_games_attended >= 0),
    PRIMARY KEY (name, year)
);

INSERT INTO myt (name, year, gender, country_of_birth, age, number_of_football_games_attended)
VALUES 
    ('John', 2010, 'M', 'USA', 25, 3),
    ('John', 2015, 'M', 'USA', 30, 8),
    ('John', 2020, 'M', 'USA', 35, 12),
    
    ('Maria', 2012, 'F', 'Brazil', 22, 2),
    ('Maria', 2014, 'F', 'Brazil', 24, 5),
    ('Maria', 2019, 'F', 'Brazil', 29, 15),
    
    
    ('Sofia', 2018, 'F', 'Spain', 31, 7),
    ('Sofia', 2019, 'F', 'Spain', 32, 9),
    ('Sofia', 2021, 'F', 'Spain', 34, 13)
);

  name year gender country_of_birth age number_of_football_games_attended
  John 2010      M              USA  25                                 3
  John 2015      M              USA  30                                 8
  John 2020      M              USA  35                                12
 Maria 2012      F           Brazil  22                                 2
 Maria 2014      F           Brazil  24                                 5
 Maria 2019      F           Brazil  29                                15
 Sofia 2018      F            Spain  31                                 7
 Sofia 2019      F            Spain  32                                 9
 Sofia 2021      F            Spain  34                                13

My goal is to add missing rows of data for each person between their min and max years and backfill the columns with logical values (e.g. gender always stays the same, country of birth always stays the same, age increases by 1, number of football games attended is NA since we can't logically infer this).

name year gender country_of_birth age number_of_football_games_attended data_source
  John 2010      M            USA  25                               3    IN ORIGINAL DATA
  John 2011      M            USA  26                          MISSING    BACKFILLED
  John 2012      M            USA  27                          MISSING    BACKFILLED
  John 2013      M            USA  28                          MISSING    BACKFILLED
  John 2014      M            USA  29                          MISSING    BACKFILLED
  John 2015      M            USA  30                               8    IN ORIGINAL DATA
  John 2016      M            USA  31                          MISSING    BACKFILLED
  John 2017      M            USA  32                          MISSING    BACKFILLED
  John 2018      M            USA  33                          MISSING    BACKFILLED
 John 2019      M            USA  34                          MISSING    BACKFILLED
 John 2020      M            USA  35                              12    IN ORIGINAL DATA

 Maria 2012     F         Brazil  22                               2    IN ORIGINAL DATA
 Maria 2013     F         Brazil  23                          MISSING    BACKFILLED
 Maria 2014     F         Brazil  24                               5    IN ORIGINAL DATA
 Maria 2015     F         Brazil  25                          MISSING    BACKFILLED
 Maria 2016     F         Brazil  26                          MISSING    BACKFILLED
 Maria 2017     F         Brazil  27                          MISSING    BACKFILLED
 Maria 2018     F         Brazil  28                          MISSING    BACKFILLED
 Maria 2019     F         Brazil  29                              15    IN ORIGINAL DATA

 Sofia 2018     F          Spain  31                               7    IN ORIGINAL DATA
 Sofia 2019     F          Spain  32                               9    IN ORIGINAL DATA
 Sofia 2020     F          Spain  33                          MISSING    BACKFILLED
 Sofia 2021     F          Spain  34                              13    IN ORIGINAL DATA

I tried to write the SQL code for this using a gap and island approach (I use join on 1=1 instead of a cross join so that this will work on servers that don't support cross joins):

WITH calendar AS (
    SELECT 2010 as year UNION ALL
    SELECT 2011 UNION ALL
    SELECT 2012 UNION ALL
    SELECT 2013 UNION ALL
    SELECT 2014 UNION ALL
    SELECT 2015 UNION ALL
    SELECT 2016 UNION ALL
    SELECT 2017 UNION ALL
    SELECT 2018 UNION ALL
    SELECT 2019 UNION ALL
    SELECT 2020 UNION ALL
    SELECT 2021
),
person_info AS (
    SELECT DISTINCT
        name,
        FIRST_VALUE(gender) OVER (PARTITION BY name ORDER BY year) as gender,
        FIRST_VALUE(country_of_birth) OVER (PARTITION BY name ORDER BY year) as country_of_birth,
        MIN(year) as min_year,
        MAX(year) as max_year,
        MIN(age) as start_age
    FROM myt
    GROUP BY name
),
filled_gaps AS (
    SELECT 
        p.name,
        c.year,
        p.gender,
        p.country_of_birth,
        CASE 
            WHEN m.age IS NOT NULL THEN m.age
            ELSE p.start_age + (c.year - p.min_year)
        END as age,
        COALESCE(
            CAST(m.number_of_football_games_attended AS VARCHAR),
            'MISSING'
        ) as number_of_football_games_attended,
        CASE 
            WHEN m.name IS NOT NULL THEN 'IN ORIGINAL DATA'
            ELSE 'BACKFILLED'
        END as data_source
    FROM person_info p
    JOIN calendar c ON 1=1
    LEFT JOIN myt m 
        ON m.name = p.name 
        AND m.year = c.year
    WHERE c.year BETWEEN p.min_year AND p.max_year
)
SELECT * 
FROM filled_gaps
ORDER BY name, year;

The code seems to run, but I am getting NAs instead of "MISSING":

 name year gender country_of_birth age number_of_football_games_attended      data_source
  John 2010      M              USA  25                                 3 IN ORIGINAL DATA
  John 2011      M              USA  26                                NA       BACKFILLED
  John 2012      M              USA  27                                NA       BACKFILLED
  John 2013      M              USA  28                                NA       BACKFILLED
  John 2014      M              USA  29                                NA       BACKFILLED
  John 2015      M              USA  30                                 8 IN ORIGINAL DATA
  John 2016      M              USA  31                                NA       BACKFILLED
  John 2017      M              USA  32                                NA       BACKFILLED
  John 2018      M              USA  33                                NA       BACKFILLED
  John 2019      M              USA  34                                NA       BACKFILLED
  John 2020      M              USA  35                                12 IN ORIGINAL DATA
 Maria 2012      F           Brazil  22                                 2 IN ORIGINAL DATA
 Maria 2013      F           Brazil  23                                NA       BACKFILLED
 Maria 2014      F           Brazil  24                                 5 IN ORIGINAL DATA
 Maria 2015      F           Brazil  25                                NA       BACKFILLED
 Maria 2016      F           Brazil  26                                NA       BACKFILLED
 Maria 2017      F           Brazil  27                                NA       BACKFILLED
 Maria 2018      F           Brazil  28                                NA       BACKFILLED
 Maria 2019      F           Brazil  29                                15 IN ORIGINAL DATA
 Sofia 2018      F            Spain  31                                 7 IN ORIGINAL DATA
 Sofia 2019      F            Spain  32                                 9 IN ORIGINAL DATA
 Sofia 2020      F            Spain  33                                NA       BACKFILLED
 Sofia 2021      F            Spain  34                                13 IN ORIGINAL DATA

Is there a way to fix this?


Solution

  • COALESCE might be causing the issue , try switching it with CASE and you can set the value for null, please replace this in your code :

    COALESCE(
        CAST(m.number_of_football_games_attended AS VARCHAR),
        'MISSING'
    ) as number_of_football_games_attended,
    

    with :

    CASE 
    
        WHEN m.number_of_football_games_attended IS NOT NULL THEN CAST(m.number_of_football_games_attended AS VARCHAR)
    
        ELSE 'MISSING'
    
    END as number_of_football_games_attended,