The following result was obtained from Part 1. bigquerqy sql link a common grid_id between groups
with t1 as
(
Select 'obrien-t j' lname_forename_long,11 grid_id_ct ,'grid.416153.4' grid_id,2 name_seq ,1 group_seq UNION ALL
Select 'obrien-t j',1,'grid.1002.3',1,1 UNION ALL
Select 'obrien-terence',2,'grid.1008.9',1,2 UNION ALL
Select 'obrien-terence',4,'grid.416153.4',2,2 UNION ALL
Select 'obrien-terence',1,'grid.484852.7',3,2 UNION ALL
Select 'obrien-terence j',14,'grid.1002.3',1,3 UNION ALL
Select 'obrien-terence j',25,'grid.1008.9',2,3 UNION ALL
Select 'obrien-terence j',3,'grid.1019.9',3,3 UNION ALL
Select 'obrien-terence j',9,'grid.1623.6',4,3 UNION ALL
Select 'obrien-terence j',40,'grid.237081.f',5,3 UNION ALL
Select 'obrien-terence j',1,'grid.267362.4',6,3 UNION ALL
Select 'obrien-terence j',2,'grid.414094.c',7,3 UNION ALL
Select 'obrien-terence j',1,'grid.416060.5',8,3 UNION ALL
Select 'obrien-terence j',36,'grid.416153.4',9,3 UNION ALL
Select 'obrien-terence j',4,'grid.453219.8',10,3 UNION ALL
Select 'obrien-terence j',3,'grid.454055.5',11,3 UNION ALL
Select 'obrien-terence j',6,'grid.474069.8',12,3 UNION ALL
Select 'obrien-terence j',13,'grid.481253.9',13,3 UNION ALL
Select 'obrien-terence john',1,'grid.1002.3',1,4 UNION ALL
Select 'obrien-terence john',1,'grid.1008.9',2,4 UNION ALL
Select 'obrien-terence john',1,'grid.1623.6',3,4 UNION ALL
Select 'obrien-terence john',1,'grid.237081.f',4,4 UNION ALL
Select 'obrien-terence john',2,'grid.416153.4',5,4 UNION ALL
Select 'obrien-terrence',2,'grid.416153.4',1,5 UNION ALL
Select 'obrien-terrence j',1,'grid.416153.4',1,6 UNION ALL
Select 'obrien-terry',1,'grid.137628.9',1,7 UNION ALL
Select 'obrien-terry',2,'grid.237081.f',2,7 UNION ALL
Select 'obrien-terry',1,'grid.267362.4',3,7 UNION ALL
Select 'obrien-timothy',1,'grid.496867.2',1,8 UNION ALL
Select 'obrien-timothy',3,'grid.6142.1',2,8
)
select *, if(count(*) over win > 0, string_agg('' || group_seq) over win, '') links
from t1
window win as (partition by grid_id) ;
The above does not include a counts column which I think maybe needed.
lname_forename_long | grid_id_ct | grid_id | name_seq | group_seq | links | link_counts |
---|---|---|---|---|---|---|
obrien-t j | 11 | grid.416153.4 | 2 | 1 | 1,2,3,4,5,6 | 6 |
obrien-t j | 1 | grid.1002.3 | 1 | 1 | 1,3,4 | 3 |
obrien-terence | 4 | grid.416153.4 | 2 | 2 | 1,2,3,4,5,6 | 6 |
obrien-terence | 2 | grid.1008.9 | 1 | 2 | 2,3,4 | 3 |
obrien-terence | 1 | grid.484852.7 | 3 | 2 | 2 | 1 |
obrien-terence j | 36 | grid.416153.4 | 9 | 3 | 1,2,3,4,5,6 | 6 |
obrien-terence j | 14 | grid.1002.3 | 1 | 3 | 1,3,4 | 3 |
obrien-terence j | 25 | grid.1008.9 | 2 | 3 | 2,3,4 | 3 |
obrien-terence j | 40 | grid.237081.f | 5 | 3 | 3,4,7 | 3 |
obrien-terence j | 9 | grid.1623.6 | 4 | 3 | 3,4 | 2 |
obrien-terence j | 1 | grid.267362.4 | 6 | 3 | 3,7 | 2 |
obrien-terence j | 3 | grid.1019.9 | 3 | 3 | 3 | 1 |
obrien-terence j | 2 | grid.414094.c | 7 | 3 | 3 | 1 |
obrien-terence j | 1 | grid.416060.5 | 8 | 3 | 3 | 1 |
obrien-terence j | 4 | grid.453219.8 | 10 | 3 | 3 | 1 |
obrien-terence j | 3 | grid.454055.5 | 11 | 3 | 3 | 1 |
obrien-terence j | 6 | grid.474069.8 | 12 | 3 | 3 | 1 |
obrien-terence j | 13 | grid.481253.9 | 13 | 3 | 3 | 1 |
obrien-terence john | 2 | grid.416153.4 | 5 | 4 | 1,2,3,4,5,6 | 6 |
obrien-terence john | 1 | grid.1002.3 | 1 | 4 | 1,3,4 | 3 |
obrien-terence john | 1 | grid.1008.9 | 2 | 4 | 2,3,4 | 3 |
obrien-terence john | 1 | grid.237081.f | 4 | 4 | 3,4,7 | 3 |
obrien-terence john | 1 | grid.1623.6 | 3 | 4 | 3,4 | 2 |
obrien-terrence | 2 | grid.416153.4 | 1 | 5 | 1,2,3,4,5,6 | 6 |
obrien-terrence j | 1 | grid.416153.4 | 1 | 6 | 1,2,3,4,5,6 | 6 |
obrien-terry | 2 | grid.237081.f | 2 | 7 | 3,4,7 | 3 |
obrien-terry | 1 | grid.267362.4 | 3 | 7 | 3,7 | 2 |
obrien-terry | 1 | grid.137628.9 | 1 | 7 | 7 | 1 |
obrien-timothy | 3 | grid.6142.1 | 2 | 8 | 8 | 1 |
obrien-timothy | 1 | grid.496867.2 | 1 | 8 | 8 | 1 |
The second part is take all the names with max(link_counts)
lname_forename_long | grid_id_ct | grid_id | name_seq | group_seq | links | link_counts |
---|---|---|---|---|---|---|
obrien-t j | 11 | grid.416153.4 | 2 | 1 | 1,2,3,4,5,6 | 6 |
obrien-terence | 4 | grid.416153.4 | 2 | 2 | 1,2,3,4,5,6 | 6 |
obrien-terence j | 36 | grid.416153.4 | 9 | 3 | 1,2,3,4,5,6 | 6 |
obrien-terence john | 2 | grid.416153.4 | 5 | 4 | 1,2,3,4,5,6 | 6 |
obrien-terrence | 2 | grid.416153.4 | 1 | 5 | 1,2,3,4,5,6 | 6 |
obrien-terrence j | 1 | grid.416153.4 | 1 | 6 | 1,2,3,4,5,6 | 6 |
Add the names that are not in max(link_counts) = 6 choose the nmaes with the highest grid_id_ct to give.
lname_forename_long | grid_id_ct | grid_id | name_seq | group_seq | links | link_counts |
---|---|---|---|---|---|---|
obrien-timothy | 3 | grid.6142.1 | 2 | 8 | 8 | 1 |
obrien-terry | 2 | grid.237081.f | 2 | 7 | 3,4,7 | 3 |
obrien-terrence j | 1 | grid.416153.4 | 1 | 6 | 1,2,3,4,5,6 | 6 |
obrien-terrence | 2 | grid.416153.4 | 1 | 5 | 1,2,3,4,5,6 | 6 |
obrien-terence john | 2 | grid.416153.4 | 5 | 4 | 1,2,3,4,5,6 | 6 |
obrien-terence j | 36 | grid.416153.4 | 9 | 3 | 1,2,3,4,5,6 | 6 |
obrien-terence | 4 | grid.416153.4 | 2 | 2 | 1,2,3,4,5,6 | 6 |
obrien-t j | 11 | grid.416153.4 | 2 | 1 | 1,2,3,4,5,6 | 6 |
If any of the new names can link to the link_counts = 6 update a column the links that can intersect.
lname_forename_long | grid_id_ct | grid_id | name_seq | group_seq | links | link_counts | is_intersect_links |
---|---|---|---|---|---|---|---|
obrien-timothy | 3 | grid.6142.1 | 2 | 8 | 8 | 1 | |
obrien-terry | 2 | grid.237081.f | 2 | 7 | 3,4,7 | 3 | 3,4 |
obrien-terrence j | 1 | grid.416153.4 | 1 | 6 | 1,2,3,4,5,6 | 6 | 3,4 |
obrien-terrence | 2 | grid.416153.4 | 1 | 5 | 1,2,3,4,5,6 | 6 | 3,4 |
obrien-terence john | 2 | grid.416153.4 | 5 | 4 | 1,2,3,4,5,6 | 6 | 3,4 |
obrien-terence j | 36 | grid.416153.4 | 9 | 3 | 1,2,3,4,5,6 | 6 | 3,4 |
obrien-terence | 4 | grid.416153.4 | 2 | 2 | 1,2,3,4,5,6 | 6 | 3,4 |
obrien-t j | 11 | grid.416153.4 | 2 | 1 | 1,2,3,4,5,6 | 6 | 3,4 |
Since we can now link obrien-terry to the other obrien-t..... names update his grid_id to be the same as obrien-t..... grid.416153.4
lname_forename_long | grid_id_ct | grid_id | name_seq | group_seq | links | link_counts | is_intersect_links | is_merged |
---|---|---|---|---|---|---|---|---|
obrien-timothy | 3 | grid.6142.1 | 2 | 8 | 8 | 1 | '' | FALSE |
obrien-terry | 2 | grid.416153.4 | 2 | 7 | 3,4,7 | 3 | 3,4 | TRUE |
obrien-terrence j | 1 | grid.416153.4 | 1 | 6 | 1,2,3,4,5,6 | 6 | 3,4 | FALSE |
obrien-terrence | 2 | grid.416153.4 | 1 | 5 | 1,2,3,4,5,6 | 6 | 3,4 | FALSE |
obrien-terence john | 2 | grid.416153.4 | 5 | 4 | 1,2,3,4,5,6 | 6 | 3,4 | FALSE |
obrien-terence j | 36 | grid.416153.4 | 9 | 3 | 1,2,3,4,5,6 | 6 | 3,4 | FALSE |
obrien-terence | 4 | grid.416153.4 | 2 | 2 | 1,2,3,4,5,6 | 6 | 3,4 | FALSE |
obrien-t j | 11 | grid.416153.4 | 2 | 1 | 1,2,3,4,5,6 | 6 | 3,4 | FALSE |
I also added is_merged to indicate that a grid_id was updated. I have added multiple steps to make it clear but its possible its one or two steps. I have tried multiple ways of doing this using cartesain joins, intersect distinct to find a common grid between names but they all came up short. In simple terms I am trying to find figure out how many unique obriens I have based on being able to assign them to a common grid_id which is basically an address.
I'm not sure if I have over complicated it with all the intermediate steps. I don't need all the meta data columns I just need to end up with .
lname_forename_long | grid_id | is_merged |
---|---|---|
obrien-timothy | grid.6142.1 | FALSE |
obrien-terry | grid.416153.4 | TRUE |
obrien-terrence j | grid.416153.4 | FALSE |
obrien-terrence | grid.416153.4 | FALSE |
obrien-terence john | grid.416153.4 | FALSE |
obrien-terence j | grid.416153.4 | FALSE |
obrien-terence | grid.416153.4 | FALSE |
obrien-t j | grid.416153.4 | FALSE |
MY effort for samuel.
with t2 as (
with t1 as
(
Select "o'brien-t j" lname,11 grid_ct ,'grid.416153.4' grid_id,2 name_seq ,1 group_seq ,'1,2,3,4,5,6' links UNION ALL
Select "o'brien-terence",2,'grid.1008.9',1,2,'' UNION ALL
Select "o'brien-terence",4,'grid.416153.4',2,2,'' UNION ALL
Select "o'brien-terence",1,'grid.484852.7',3,2,'1,2,3,4,5,6' UNION ALL
Select "o'brien-terence j",14,'grid.1002.3',1,3,'3,7' UNION ALL
Select "o'brien-terence j",25,'grid.1008.9',2,3,'' UNION ALL
Select "o'brien-terence j",3,'grid.1019.9',3,3,'' UNION ALL
Select "o'brien-terence j",9,'grid.1623.6',4,3,'' UNION ALL
Select "o'brien-terence j",40,'grid.237081.f',5,3,'' UNION ALL
Select "o'brien-terence j",1,'grid.267362.4',6,3,'' UNION ALL
Select "o'brien-terence j",2,'grid.414094.c',7,3,'' UNION ALL
Select "o'brien-terence j",1,'grid.416060.5',8,3,'' UNION ALL
Select "o'brien-terence j",36,'grid.416153.4',9,3,'' UNION ALL
Select "o'brien-terence j",4,'grid.453219.8',10,3,'' UNION ALL
Select "o'brien-terence j",3,'grid.454055.5',11,3,'' UNION ALL
Select "o'brien-terence j",6,'grid.474069.8',12,3,'1,2,3,4,5,6' UNION ALL
Select "o'brien-terence j",13,'grid.481253.9',13,3,'3,4' UNION ALL
Select "o'brien-terence john",1,'grid.1002.3',1,4,'' UNION ALL
Select "o'brien-terence john",1,'grid.1008.9',2,4,'' UNION ALL
Select "o'brien-terence john",1,'grid.1623.6',3,4,'' UNION ALL
Select "o'brien-terence john",1,'grid.237081.f',4,4,'3,4' UNION ALL
Select "o'brien-terence john",2,'grid.416153.4',5,4,'1,2,3,4,5,6' UNION ALL
Select "o'brien-terrence",2,'grid.416153.4',1,5,'1,2,3,4,5,6' UNION ALL
Select "o'brien-terrence j",1,'grid.416153.4',1,6,'1,2,3,4,5,6' UNION ALL
Select "o'brien-terry",1,'grid.137628.9',1,7,'' UNION ALL
Select "o'brien-terry",2,'grid.237081.f',2,7,'3,7' UNION ALL
Select "o'brien-terry",1,'grid.267362.4',3,7,'' UNION ALL
Select "o'brien-timothy",1,'grid.496867.2',1,8,'' UNION ALL
Select "o'brien-timothy",3,'grid.6142.1',2,8,''
)
select distinct a.lname, a.grid_id
from t1 a, t1 b
where a.lname <> b.lname
and a.grid_id = b.grid_id
)
select distinct lname,
grid_id ,
DENSE_RANK() OVER
(
--PARTITION BY a.lname_init1
ORDER BY grid_id
) seq_num,
from t2
)
select
'matched' is_matched,
lname
,grid_id
,seq_num
from t3
group by lname ,grid_id,seq_num
having seq_num = (select max(seq_num )x from t3)
------------------------------------------
union all
--intersect distinct
------------------------------------------
select
'not_matched' is_matched,
lname
,grid_id
,seq_num
from t3
group by lname ,grid_id,seq_num
having seq_num != (select max(seq_num )x from t3);
My result. I could not figure out how to merge o'brien-terry to the matched group. It also missed o'brien-timothy
is_matched | lname | grid_id | seq_num |
---|---|---|---|
not_matched | o'brien-terence j | grid.1002.3 | 1 |
not_matched | o'brien-terence john | grid.1002.3 | 1 |
not_matched | o'brien-terence | grid.1008.9 | 2 |
not_matched | o'brien-terence j | grid.1008.9 | 2 |
not_matched | o'brien-terence john | grid.1008.9 | 2 |
not_matched | o'brien-terence j | grid.1623.6 | 3 |
not_matched | o'brien-terence john | grid.1623.6 | 3 |
not_matched | o'brien-terence j | grid.237081.f | 4 |
not_matched | o'brien-terence john | grid.237081.f | 4 |
not_matched | o'brien-terry | grid.237081.f | 4 |
not_matched | o'brien-terence j | grid.267362.4 | 5 |
not_matched | o'brien-terry | grid.267362.4 | 5 |
matched | o'brien-t j | grid.416153.4 | 6 |
matched | o'brien-terence | grid.416153.4 | 6 |
matched | o'brien-terence j | grid.416153.4 | 6 |
matched | o'brien-terence john | grid.416153.4 | 6 |
matched | o'brien-terrence | grid.416153.4 | 6 |
matched | o'brien-terrence j | grid.416153.4 | 6 |
Samuel result.
lname_forename_long | grid_id_ct | grid_id | name_seq | group_seq | links | link_counts | is_intersect_links |
---|---|---|---|---|---|---|---|
obrien-t j | 1 | grid.1002.3 | 1 | 1 | 1,3,4 | 3 | 1,3,4 |
obrien-terence | 2 | grid.1008.9 | 1 | 2 | 2,3,4 | 3 | 2,3,4 |
obrien-terence j | 14 | grid.1002.3 | 1 | 3 | 1,3,4 | 3 | 1,3,4 |
obrien-terence john | 1 | grid.1002.3 | 1 | 4 | 1,3,4 | 3 | 1,3,4 |
obrien-terry | 2 | grid.237081.f | 2 | 7 | 3,4,7 | 3 | |
obrien-timothy | 1 | grid.496867.2 | 1 | 8 | 8 | 1 |
Consider below approach
with temp as (
select *, array_length(split(links)) link_counts,
array_length(split(links)) < max(array_length(split(links))) over() merge_candidate
from (
select *, if(count(*) over win > 1, string_agg('' || group_seq) over win, '') links
from t1
window win as (partition by grid_id)
)
qualify 1 = row_number() over(partition by group_seq order by array_length(split(links)) desc, grid_id_ct desc)
)
select lname_forename_long, grid_id, merge_candidate as is_merged
from temp where not merge_candidate
union all
select lname_forename_long, ifnull(merged_grid_id, grid_id), if(merged_grid_id is null, false, true)
from (
select any_value(t1).*,
any_value(( select t2.grid_id
from unnest(split(t1.links)) link
join unnest(split(t2.links)) link
using(link)
limit 1
)) as merged_grid_id
from (select * from temp where merge_candidate) t1
cross join (select * from temp where not merge_candidate) t2
group by to_json_string(t1)
)
order by grid_id desc, lname_forename_long desc
if applied to sample data in your question - output is