I need to clusterize list of unique pairs of id which intersect between each other. Simplest example:
data_rows=[
{'clid':1,'uid':'a'},
{'clid':2,'uid':'b'},
{'clid':3,'uid':'b'},
{'clid':4,'uid':'c'},
{'clid':4,'uid':'x'},
{'clid':5,'uid':'d'},
{'clid':5,'uid':'e'},
{'clid':6,'uid':'d'},
{'clid':6,'uid':'e'},
{'clid':6,'uid':'f'},
]
# I transform data to this to this.
clid_dic = {1:['a'], 2:['b'],3:['b'], 4:['c','x'],5:['d','e'],6:['d','e','f']}
uid_dic = {'a':[1], 'b':[2,3], 'c':[4], 'x':[4], 'd':[5,6], 'e':[5,6], 'f':[6]}
# expected result
eid_dic = {
1:{'clids':[1],'uids':['a']},
2:{'clids':[2,3],'uids':['b']},
3:{'clids':[4],'uids':['c','x']},
4:{'clids':[5,6],'uids':['d','e','f']}
}
I'm constantly failing to create solution which will work on bigger list of pairs(>500.000) and not fall into recursion or other errors. My solution looks like this:
def crawl(clid):
clids, uids = [],[]
# add current clid to batch
if clid not in clids: clids.append(clid)
#collect its uids
for uid in clid_dic[clid]:
if uid not in uids: uids.append(uid)
#remove from dic
del clid_dic[clid]
# collect new clids for each uid
for uid in uids:
if uid in uid_dic:
for cld in uid_dic[uid]:
if cld not in clids:
clids.append(cld)
# for each collected clid call self
for cld in clids:
if cld in clid_dic:
tmp = crawl(cld)
clids.extend([x for x in tmp[0] if x not in clids])
uids.extend([x for x in tmp[1] if x not in uids])
return(clids,uids)
def loop():
i = 0
eid_dic = {}
while True:
n = len(clid_dic)
if n == 0: break
i += 1
clid = next(iter(clid_dic))
tmp = crawl(clid)
clids, uids= tmp[0], tmp[1]
eid_dic[i] = {'clids': clids, 'uids': uids}
print(eid_dic)
if __name__ == '__main__':
loop()
It seems like basic task(to clusterize smth) but lack of experience and knowledge lead me to dead end.
If order of items is not a problem, here is one solution (using sets for clid_dic
and uid_dic
values):
data_rows=[
{'clid':1,'uid':'a'},
{'clid':2,'uid':'b'},
{'clid':3,'uid':'b'},
{'clid':4,'uid':'c'},
{'clid':4,'uid':'x'},
{'clid':5,'uid':'d'},
{'clid':5,'uid':'e'},
{'clid':6,'uid':'d'},
{'clid':6,'uid':'e'},
{'clid':6,'uid':'f'},
]
clid_dic = {}
uid_dic = {}
for d in data_rows:
clid_dic.setdefault(d['clid'], set()).add(d['uid'])
uid_dic.setdefault(d['uid'], set()).add(d['clid'])
eid_dic, cnt = {}, 1
while clid_dic:
k, v = clid_dic.popitem()
k = {k}
to_remove = []
for k2, v2 in clid_dic.items():
if v & v2:
v = v.union(v2)
to_remove.append(k2)
for k2 in to_remove:
del clid_dic[k2]
to_remove = []
for k2, v2 in uid_dic.items():
if k & v2:
k = k.union(v2)
to_remove.append(k2)
for k2 in to_remove:
del uid_dic[k2]
eid_dic[cnt] = {'clids':list(k), 'uids':list(v)}
cnt += 1
from pprint import pprint
pprint(eid_dic)
Prints:
{1: {'clids': [5, 6], 'uids': ['e', 'd', 'f']},
2: {'clids': [4], 'uids': ['c', 'x']},
3: {'clids': [2, 3], 'uids': ['b']},
4: {'clids': [1], 'uids': ['a']}}