Search code examples
pythonpython-3.xalgorithmgraph-algorithm

Clusterizaztion in Python: How to group list of unique pairs of ids?


I need to clusterize list of unique pairs of id which intersect between each other. Simplest example:

data_rows=[
    {'clid':1,'uid':'a'},
    {'clid':2,'uid':'b'},
    {'clid':3,'uid':'b'},
    {'clid':4,'uid':'c'},
    {'clid':4,'uid':'x'},
    {'clid':5,'uid':'d'},
    {'clid':5,'uid':'e'},
    {'clid':6,'uid':'d'},
    {'clid':6,'uid':'e'},
    {'clid':6,'uid':'f'},
]
# I transform data to this to this.
clid_dic = {1:['a'], 2:['b'],3:['b'], 4:['c','x'],5:['d','e'],6:['d','e','f']}
uid_dic = {'a':[1], 'b':[2,3], 'c':[4], 'x':[4], 'd':[5,6], 'e':[5,6], 'f':[6]}
# expected result
eid_dic = {
    1:{'clids':[1],'uids':['a']},
    2:{'clids':[2,3],'uids':['b']},
    3:{'clids':[4],'uids':['c','x']},
    4:{'clids':[5,6],'uids':['d','e','f']}
}

I'm constantly failing to create solution which will work on bigger list of pairs(>500.000) and not fall into recursion or other errors. My solution looks like this:

def crawl(clid):
    clids, uids = [],[]
    # add current clid to batch
    if clid not in clids: clids.append(clid)

    #collect its uids
    for uid in clid_dic[clid]:
        if uid not in uids: uids.append(uid)

    #remove from dic
    del clid_dic[clid]

    # collect new clids for each uid
    for uid in uids:
        if uid in uid_dic:
            for cld in uid_dic[uid]:
                if cld not in clids:
                    clids.append(cld)
    # for each collected clid call self
    for cld in clids:
        if cld in clid_dic:
            tmp  = crawl(cld)
            clids.extend([x for x in tmp[0] if x not in clids])
            uids.extend([x for x in tmp[1] if x not in uids])
    return(clids,uids)

def loop():
    i = 0
    eid_dic = {}
    while True:
        n = len(clid_dic)
        if n == 0: break
        i += 1
        clid = next(iter(clid_dic))
        tmp = crawl(clid)
        clids, uids= tmp[0], tmp[1]

        eid_dic[i] = {'clids': clids, 'uids': uids}
    print(eid_dic)
if __name__ == '__main__':
    loop()

It seems like basic task(to clusterize smth) but lack of experience and knowledge lead me to dead end.


Solution

  • If order of items is not a problem, here is one solution (using sets for clid_dic and uid_dic values):

    data_rows=[
        {'clid':1,'uid':'a'},
        {'clid':2,'uid':'b'},
        {'clid':3,'uid':'b'},
        {'clid':4,'uid':'c'},
        {'clid':4,'uid':'x'},
        {'clid':5,'uid':'d'},
        {'clid':5,'uid':'e'},
        {'clid':6,'uid':'d'},
        {'clid':6,'uid':'e'},
        {'clid':6,'uid':'f'},
    ]
    
    clid_dic = {}
    uid_dic = {}
    for d in data_rows:
        clid_dic.setdefault(d['clid'], set()).add(d['uid'])
        uid_dic.setdefault(d['uid'], set()).add(d['clid'])
    
    eid_dic, cnt = {}, 1
    
    while clid_dic:
        k, v = clid_dic.popitem()
        k = {k}
    
        to_remove = []
        for k2, v2 in clid_dic.items():
            if v & v2:
                v = v.union(v2)
                to_remove.append(k2)
        for k2 in to_remove:
            del clid_dic[k2]
    
        to_remove = []
        for k2, v2 in uid_dic.items():
            if k & v2:
                k = k.union(v2)
                to_remove.append(k2)
        for k2 in to_remove:
            del uid_dic[k2]
    
        eid_dic[cnt] = {'clids':list(k), 'uids':list(v)}
        cnt += 1
    
    from pprint import pprint
    pprint(eid_dic)
    

    Prints:

    {1: {'clids': [5, 6], 'uids': ['e', 'd', 'f']},
     2: {'clids': [4], 'uids': ['c', 'x']},
     3: {'clids': [2, 3], 'uids': ['b']},
     4: {'clids': [1], 'uids': ['a']}}