Search code examples
sqlsnowflake-cloud-data-platformflatten

Convert Nested Array into Columns in Snowflake


I have a table column with nested arrays in a Snowflake database. I want to convert the nested array into columns in the manner shown below in Snowflake SQL.

Table Name: SENSOR_DATA

The RX column is of data type VARIANT. The nested arrays will not always be 3 as shown below. There are cases where there are 20,000 nested arrays, and other cases where there are none.

| ID |             RX              |
|----|-----------------------------|
| 1  |[[0, 15], [1, 50], [2, 34.2]]|
| 2  |[[0, 20], [1, 25]]           |
| 3  |[[0, 100], [1, 42], [2, 76]] |

I want to achieve something like this from the table above:

| ID |Col0 | Col1| Col2|
|----|-----|-----|-----|
| 1  |  15 |  50 | 34.2|
| 2  |  20 |  25 | NULL|
| 3  | 100 |  42 | 76  |

Solution

  • Lets recreate that table:

    create table sensor_data as 
      select column1 id, parse_json(column2) rx
      from values (1, '[[0, 15], [1, 50], [2, 34.2]]')
                 ,(2, '[[0, 20], [1, 25]]')
                 ,(3, '[[0, 100], [1, 42], [2, 76]]');
    

    Then get the distinct column keys that you have:

    select distinct r.value[0] from sensor_data, table(flatten(input=>rx)) r;
    
    R.VALUE[0]
    0
    1
    2

    Given we want this to be dynamic, the next step is just to check with static SQL it gives us the correct answer:

     select id
        ,max(iff(r.value[0] = 0, r.value[1], null)) as col_0
        ,max(iff(r.value[0] = 1, r.value[1], null)) as col_1
        ,max(iff(r.value[0] = 2, r.value[1], null)) as col_2
     from sensor_data, table(flatten(input=>rx)) r
     group by 1
     order by 1;
    
    ID COL_0 COL_1 COL_2
    1 15 50 34.2
    2 20 25 null
    3 100 42 76

    write now to do this dynamically:

    declare
      sql string;
      c1 cursor for select distinct r.value[0] as key from sensor_data, table(flatten(input=>rx)) r;
    begin
      sql := 'select id ';
      for record in c1 do
        sql := sql || ',max(iff(r.value[0] = '|| record.key::text ||', r.value[1], null)) as col_' || record.key::text;
      end for;
      sql := sql || ' from sensor_data, table(flatten(input=>rx)) r  group by 1  order by 1';
      return sql;
    end;
    

    which gives us:

    select id ,max(iff(r.value[0] = 0, r.value1, null)) as col_0,max(iff(r.value[0] = 1, r.value1, null)) as col_1,max(iff(r.value[0] = 2, r.value1, null)) as col_2 from sensor_data, table(flatten(input=>rx)) r group by 1 order by 1

    which when run gives us the expected results.

    so now we want to run that:

    declare
      sql string;
      res resultset;
      c1 cursor for select distinct r.value[0] as key from sensor_data, table(flatten(input=>rx)) r;
    begin
      sql := 'select id ';
      for record in c1 do
        sql := sql || ',max(iff(r.value[0] = '|| record.key::text ||', r.value[1], null)) as col_' || record.key::text;
      end for;
      sql := sql || ' from sensor_data, table(flatten(input=>rx)) r  group by 1  order by 1';
      
      res := (execute immediate :sql);
      return table (res);
    end;
    

    gives:

    ID COL_0 COL_1 COL_2
    1 15 50 34.2
    2 20 25 null
    3 100 42 76

    based of code from these sections of the manual:

    Working with loops

    Working with Resultsets

    With extra "tricky data" and the mentioned ORDER BY:

    create or replace table sensor_data as 
      select column1 id, parse_json(column2) rx
      from values (1, '[[0, 15], [1, 50], [2, 34.2]]')
                 ,(2, '[[0, 20], [1, 25]]')
                 ,(3, '[[0, 100], [1, 42], [2, 76]]')
                 ,(4, '[[0,20],[30,50], [45, 100]]');
    
    declare
      sql string;
      res resultset;
      c1 cursor for select distinct r.value[0] as key from sensor_data, table(flatten(input=>rx)) r order by key;
    begin
      sql := 'select id ';
      for record in c1 do
        sql := sql || ',max(iff(r.value[0] = '|| record.key::text ||', r.value[1], null)) as col_' || record.key::text;
      end for;
      sql := sql || ' from sensor_data, table(flatten(input=>rx)) r  group by 1  order by 1';
      
      res := (execute immediate :sql);
      return table (res);
    end;
    

    gives:

    ID COL_0 COL_1 COL_2 COL_30 COL_45
    1 15 50 34.2 null null
    2 20 25 null null null
    3 100 42 76 null null
    4 20 null null 50 100

    So yes, but that's the MAX doing:

    so if we go back in to original data

    create or replace table sensor_data as 
      select column1 id, parse_json(column2) rx
      from values (1, '[[0, 15], [1, 50], [2, 34.2]]')
                 ,(2, '[[0, 20], [1, 25]]')
                 ,(3, '[[0, 100], [1, 42], [2, 76]]');
    

    and alter the code to not have the MAX and the GROUP BY

     select id
        ,iff(r.value[0] = 0, r.value[1], null) as col_0
        ,iff(r.value[0] = 1, r.value[1], null) as col_1
        ,iff(r.value[0] = 2, r.value[1], null) as col_2
     from sensor_data, table(flatten(input=>rx)) r
     order by 1;  
    

    we see:

    ID COL_0 COL_1 COL_2
    1 15 null null
    1 null 50 null
    1 null null 34.2
    2 20 null null
    2 null 25 null
    3 100 null null
    3 null 42 null
    3 null null 76

    so we can see those values getting unrolled, now the GROUP BY id, is going to roll those three id's up, thus for col0 we have 15, null, null MAX takes the higest value, and 15 is higher that null so that is what is keep. This is the process this is working on.