I created a new table with repeating records with duplicates. I am trying to find the most efficient way to deduplicate records as this will be run on a table with millions of records. If you using multiple CTE's nested does it matter what your data structure is the processing is done in memory or does it write to temp tables when there is a lot of data.
create or replace table t1.cte4 as
WITH t1 AS (
SELECT 1 as id,'eren' AS last_name UNION ALL
SELECT 1 as id,'yilmaz' AS last_name UNION ALL
SELECT 1 as id,'kaya' AS last_name UNION ALL
SELECT 1 as id,'kaya' AS last_name UNION ALL
SELECT 2 as id,'smith' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'brown' AS last_name
)
SELECT id,ARRAY_AGG(STRUCT(last_name)) AS last_name_rec
FROM t1
GROUP BY id;
I can remove duplicates as follows.
QUERY 1 How to dedup the concat_struct ?
select id,
STRING_AGG( distinct ln.last_name ,'~') as concat_string,
ARRAY_AGG(STRUCT( ln.last_name )) as concat_struct
from `t1.cte4`, unnest(last_name_rec) ln
group by id;
QUERY 2 Is there a better way then this to dedup?
select distinct id,
TO_JSON_STRING(ARRAY_AGG(ln.last_name) OVER (PARTITION BY id)) json_string
from `t1.cte4`, unnest(last_name_rec) ln
group by id,
ln.last_name;
How do I get it out of the table as distinct rather then using the CTE. This does not dedup.
select id, ARRAY_AGG(STRUCT( ln.last_name )) as concat_struct
from t1.cte4,
unnest(last_name_rec) ln group by id;
I can't do this.
select id, ARRAY_AGG(distinct STRUCT( ln.last_name )) as concat_struct from t1.cte4,
unnest(last_name_rec) ln group by id;
UPDATE: Decompose the struct before deduplication and then compose it back:
select id, ARRAY_AGG(STRUCT(last_name)) as concat_struct
from (
select id, ln.last_name
from cte4, unnest(last_name_rec) ln
group by id, ln.last_name
) d
group by id
(original answer based on unwanted change of table definition follows)
Just use array_agg(distinct ...)
:
WITH t1 AS (
SELECT 1 as id,'eren' AS last_name UNION ALL
SELECT 1 as id,'yilmaz' AS last_name UNION ALL
SELECT 1 as id,'kaya' AS last_name UNION ALL
SELECT 1 as id,'kaya' AS last_name UNION ALL
SELECT 2 as id,'smith' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'brown' AS last_name
)
SELECT id,ARRAY_AGG(distinct last_name) AS last_name_rec
FROM t1
GROUP BY id;