Search code examples
sqlsnowflake-cloud-data-platformuser-defined-functions

Equivalent in snowflake for pythonic itertools.combinations()


I have column with some jsons:

col1
[{'key1': 1}, {'key1': 2}, {'key1': 3}, {'key1': 4}]
[{'key1': 12}, {'key1': 22}, {'key1': 33}, {'key1': 44}]

I'm looking for a way to get combinations in k size without repetition and without order of 'key1''s values. For example:

Foo(col1, 2)
[(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
[(11, 22), (11, 33), (11, 44), (22, 33), (22, 44), (33, 44)]

Foo(col1, 3)
[(1, 2, 3), (1, 2, 4), (1, 3, 4), (2, 3, 4)]
[(11, 22, 33), (11, 22, 44), (11, 33, 44), (22, 33, 44)]

Foo(col1, 4)
[(1, 2, 3, 4)]
[(11, 22, 33, 44)]

The only option in my mind is using FLATTEN and k joins, which is not so elegant and won't scale.


Solution

  • So given this is looping/recursion, you could ether build a recursive CTE which sounds overly complex for dynamic length requirements, of you can bust out one of the embedded languages (Java, Python, JavaScript) and use one of the many combination solutions from that language.

    like Javascript - Generating all combinations of elements in a single array (in pairs)

    So given we want to output many rows per input row, this sounds like a UDTF:

    thus using this answer as a base:

    CREATE OR REPLACE FUNCTION combine ( /*in_array VARIANT*/ len double )
      RETURNS TABLE ( out_str TEXT )
      LANGUAGE JAVASCRIPT
      AS $$
       {
            processRow: function f(row, rowWriter, context)  {
                var s = this.choose([0,1,2,3,4], row.LEN).join('x');
              rowWriter.writeRow( {OUT_STR: s } );
            },
            initialize: function(argumentInfo, context) {
                function choose(arr, k, prefix=[]) {
                    if (k == 0) return [prefix];
                    return arr.flatMap((v, i) =>
                        choose(arr.slice(i+1), k-1, [...prefix, v])
                    );
                };
    
                this.choose = choose;
    
        }
       }
      $$;
    

    there is a UDFT the has fixed input, but takes in combination width.

    thus for 2:

    select *
    from values (1),(2) as x
    cross join table(combine(2::double)) as y;
    

    enter image description here

    and 3

    enter image description here

    so now that needs to be updated to take the array as input, and formatting for the paren bases string you are wanting.

    input array:

    CREATE OR REPLACE FUNCTION combine ( in_array VARIANT, len double )
      RETURNS TABLE ( out_str TEXT )
      LANGUAGE JAVASCRIPT
      AS $$
       {
            processRow: function f(row, rowWriter, context)  {
                var s = this.choose(row.IN_ARRAY, row.LEN).join('x');
              rowWriter.writeRow( {OUT_STR: s } );
            },
            initialize: function(argumentInfo, context) {
                function choose(arr, k, prefix=[]) {
                    if (k == 0) return [prefix];
                    return arr.flatMap((v, i) =>
                        choose(arr.slice(i+1), k-1, [...prefix, v])
                    );
                };
    
                this.choose = choose;
    
        }
       }
      $$;
    
    select *
    from values (1),(2) as x
    cross join table(combine([1,2,3]::variant,2::double)) as y;
    

    and you data type:

    with data(col1) as (
        select parse_json($1) from values
        ('[{"key1": 1}, {"key1": 2}, {"key1": 3}, {"key1": 4}]'),
        ('[{"key1": 12}, {"key1": 22}, {"key1": 33}, {"key1": 44}]')
    )
    select *
    from data as d
    cross join table(combine(d.col1,2::double)) as y;
    

    enter image description here

    which requires this one line change:

    choose(arr.slice(i+1), k-1, [...prefix, v.key1])
    

    enter image description here

    now the formatting:

    CREATE OR REPLACE FUNCTION combine ( in_array VARIANT, len double )
      RETURNS TABLE ( out_str TEXT )
      LANGUAGE JAVASCRIPT
      AS $$
       {
            processRow: function f(row, rowWriter, context)  {
                var s = this.choose(row.IN_ARRAY, row.LEN)
                    .map(m => '(' + m.join() + ')' )
                    .join(', ');
              rowWriter.writeRow( {OUT_STR: s } );
            },
            initialize: function(argumentInfo, context) {
                function choose(arr, k, prefix=[]) {
                    if (k == 0) return [prefix];
                    return arr.flatMap((v, i) =>
                        choose(arr.slice(i+1), k-1, [...prefix, v.key1])
                    );
                };
    
                this.choose = choose;
        }
       }
      $$;
    

    with 3:

    with data(col1) as (
        select parse_json($1) from values
        ('[{"key1": 1}, {"key1": 2}, {"key1": 3}, {"key1": 4}]'),
        ('[{"key1": 12}, {"key1": 22}, {"key1": 33}, {"key1": 44}]')
    )
    select *
    from data as d
    cross join table(combine(d.col1,3::double)) as y;
    

    enter image description here