Search code examples
pythonpostgresqlsqlalchemy

How to group by and aggregate Int field and array[int] fields?


So, lets say I've got some table with 2 fields - first_col is int and second_col is array[int]:

from sqlalchemy import (
    Table,
    select,
    Integer,
)
from sqlalchemy.dialects.postgresql import ARRAY



some_table = Table(
    "some_table",
    Column("id", Integer, primary_key=True),
    Column("first_col", Integer),
    Column("second_col", ARRAY(Integer)),
)

And, I want to take all objects from the db with both first_col and second_col and like aggregate them, I can show what I want in python code:

def get_some_stuff_and_aggregate() -> dict:
    query = (
        select(
            [
                some_table.c.first_col,
                some_table.c.second_col,
            ]
        )
        .select_from(some_table)
    )
    # execute query here... it's not important here
    query_res = ...

    res_dict = {}
    for some_obj in query_res:
        for some_ids in (
            [some_obj.first_col],
            some_obj.second_col,
        ):
            for some_id in some_ids:
                if some_id in res_dict:
                    res_dict[some_id] += 1
                else:
                    res_dict[some_id] = 1

    return res_dict

But, I guess I can do this at the SQL (ORM) level without this dirty and slow python aggregation, can you please help me? How to do it better, using ORM?


Solution

  • There is probably a better SQL, but this seems to do what I think you ask for.

    from sqlalchemy import Column, Integer, MetaData, Table, create_engine, func, insert, select
    from sqlalchemy.dialects.postgresql import ARRAY, array
    from sqlalchemy.orm import Session
    
    engine = create_engine("postgresql+psycopg://")
    metadata = MetaData()
    some_table = Table(
        "some_table",
        metadata,
        Column("id", Integer, primary_key=True),
        Column("first_col", Integer),
        Column("second_col", ARRAY(Integer)),
    )
    metadata.create_all(engine)
    
    dummy_data = [
        {"first_col": 1, "second_col": [1, 2, 1, 4, 5]},
        {"first_col": 3, "second_col": [2, 4, 6, 2, 10]},
        {"first_col": 6, "second_col": [3, 6, 9, 3, 15]},
        {"first_col": 4, "second_col": [4, 8, 4, 16, 20]},
        {"first_col": 2, "second_col": [5, 10, 15, 5, 25]},
    ]
    
    with Session(engine) as session:
        session.execute(insert(some_table).values(dummy_data))
        session.commit()
    
    with Session(engine) as session:
        subq = select(
            func.unnest(
                some_table.c.second_col.op("||")(array([some_table.c.first_col]))
            ).label("element")
        ).subquery()
    
        statement = select(subq.c.element, func.count("*")).group_by(subq.c.element)
    
        for i in session.execute(statement):
            print(i)
    

    This generates the following sql and output.

    SELECT
        anon_1.element,
        count(% (count_2)
            s::VARCHAR) AS count_1
    FROM (
        SELECT
            unnest(some_table.second_col || ARRAY [some_table.first_col]) AS element
        FROM
            some_table) AS anon_1
    GROUP BY
        anon_1.element
    
    (9, 1)
    (15, 2)
    (3, 3)
    (5, 3)
    (4, 5)
    (10, 2)
    (6, 3)
    (2, 4)
    (16, 1)
    (25, 1)
    (20, 1)
    (1, 3)
    (8, 1)