Search code examples
sqlgoogle-bigqueryduplicatesarray-agg

How to dedup array_agg in bigquery


I created a new table with repeating records with duplicates. I am trying to find the most efficient way to deduplicate records as this will be run on a table with millions of records. If you using multiple CTE's nested does it matter what your data structure is the processing is done in memory or does it write to temp tables when there is a lot of data.

create or replace table t1.cte4 as
WITH t1 AS (
  SELECT 1 as id,'eren' AS last_name UNION ALL
  SELECT 1 as id,'yilmaz' AS last_name UNION ALL
  SELECT 1 as id,'kaya' AS last_name UNION ALL
  SELECT 1 as id,'kaya' AS last_name UNION ALL
  SELECT 2 as id,'smith' AS last_name UNION ALL
  SELECT 2 as id,'jones' AS last_name UNION ALL
  SELECT 2 as id,'jones' AS last_name UNION ALL
  SELECT 2 as id,'jones' AS last_name UNION ALL
  SELECT 2 as id,'brown' AS last_name
)
SELECT id,ARRAY_AGG(STRUCT(last_name)) AS last_name_rec
FROM t1
GROUP BY id;

I can remove duplicates as follows.

QUERY 1 How to dedup the concat_struct ?
select id, 
STRING_AGG( distinct ln.last_name ,'~') as concat_string,
ARRAY_AGG(STRUCT( ln.last_name )) as concat_struct
from `t1.cte4`, unnest(last_name_rec) ln
group by id;

QUERY 1

QUERY 2 Is there a better way then this to dedup?
select distinct id, 
TO_JSON_STRING(ARRAY_AGG(ln.last_name) OVER (PARTITION BY id)) json_string
from `t1.cte4`, unnest(last_name_rec) ln
group by id,
ln.last_name;

QUERY 2

How do I get it out of the table as distinct rather then using the CTE. This does not dedup.

select id,  ARRAY_AGG(STRUCT( ln.last_name )) as concat_struct 
from t1.cte4, 
unnest(last_name_rec) ln group by id; 

I can't do this.

select id,  ARRAY_AGG(distinct STRUCT( ln.last_name )) as concat_struct from t1.cte4, 
unnest(last_name_rec) ln group by id;

Solution

  • UPDATE: Decompose the struct before deduplication and then compose it back:

    select id, ARRAY_AGG(STRUCT(last_name)) as concat_struct 
    from (
      select id, ln.last_name
      from cte4, unnest(last_name_rec) ln
      group by id, ln.last_name 
    ) d
    group by id
    

    (original answer based on unwanted change of table definition follows)

    Just use array_agg(distinct ...):

    WITH t1 AS (
      SELECT 1 as id,'eren' AS last_name UNION ALL
      SELECT 1 as id,'yilmaz' AS last_name UNION ALL
      SELECT 1 as id,'kaya' AS last_name UNION ALL
      SELECT 1 as id,'kaya' AS last_name UNION ALL
      SELECT 2 as id,'smith' AS last_name UNION ALL
      SELECT 2 as id,'jones' AS last_name UNION ALL
      SELECT 2 as id,'jones' AS last_name UNION ALL
      SELECT 2 as id,'jones' AS last_name UNION ALL
      SELECT 2 as id,'brown' AS last_name
    )
    SELECT id,ARRAY_AGG(distinct last_name) AS last_name_rec
    FROM t1
    GROUP BY id;