The following (simplified) json type of data defines a Contact:
{
id: number;
name: string;
phone: string;
email: string
}
There is the following group of data:
+---+----------+-------------+---------------------------+
|id | name | phone |email |
+---+----------+-------------+---------------------------+
|1 | John | 11111111 |[email protected] |
|2 | Marc | 22222222 |[email protected] |
|3 | Ron | 99999999 |[email protected] |
|4 | Andrew | 55555555 |[email protected] |
|5 | Wim | 99999999 |[email protected] |
|6 | Marc | 33333333 |[email protected] |
|7 | Dan | 44444444 |[email protected] |
+---+----------+-------------+---------------------------+
Goal is to find the groups that belong together using javascript(optionally in lodash, but main idea is to get the algorithm clear) according to the following constraint: a contact belongs to a group when any of the following criteria are the same: name, phone or email. The results shows the id's grouped as arrays in arrays. a contact in a group of 1 is ignored.
In the example above, it means that contacts with ids 1,3,5 belong together since 1,3 share the same email and 3 and 5 share the same phonenumber. Likewise 2,6,7: 2 and 6 have the same name and 6 and 7 have the same email. 5 does not have anything in common.
The expected result therefore is:
[[1,3,5], [2,6,7]]
Background: One solution that works is iterating over every item and check for the remainder of the list if name, email, or phone is the same. If so, group them and take them out of the list (in the example we compare 1 with all the items in the list and only 3 is found). problem is that next items also need to be checked again to these groups, because in this case 5 is not yet detected as part of the group. This makes the algorithm complex, while I suspect there is a simple way of solving this in linear time. There might also be a name for this class of problems? `
Idea:
Union find is an efficient structure to handle merging of disjoint sets. Code taken from here. As it uses path compression and union by rank, you can consider that the whole code is linear in the amount of contacts.
var data = [
{id:1,name:'John',phone:'11111111',email:'[email protected]'},
{id:2,name:'Marc',phone:'99999999',email:'[email protected]'},
{id:3,name:'Ron',phone:'99999999',email:'[email protected]'},
{id:4,name:'Andrew',phone:'55555555',email:'[email protected]'},
{id:5,name:'Wim',phone:'99999999',email:'[email protected]'},
{id:6,name:'Marc',phone:'33333333',email:'[email protected]'},
{id:7,name:'Dan',phone:'44444444',email:'[email protected]'}
];
// UNION-FIND structure, with path comression and union by rank
var UNIONFIND = (function () {
function _find(n)
{
if(n.parent == n) return n;
n.parent = _find(n.parent);
return n.parent;
}
return {
makeset:function(id){
var newnode = {
parent: null,
id: id,
rank: 0
};
newnode.parent = newnode;
return newnode;
},
find: _find,
combine: function(n1, n2) {
var n1 = _find(n1);
var n2 = _find(n2);
if (n1 == n2) return;
if(n1.rank < n2.rank)
{
n2.parent = n2;
return n2;
}
else if(n2.rank < n1.rank)
{
n2.parent = n1;
return n1;
}
else
{
n2.parent = n1;
n1.rank += 1;
return n1;
}
}
};
})();
var groupHash = {name: {}, phone: {}, email: {}}
var groupNodes = []
data.forEach(function(contact){
var group = UNIONFIND.makeset(contact.id);
var groups = new Set();
["name", "phone", "email"].forEach(function(attr){
if (groupHash[attr].hasOwnProperty(contact[attr])) groups.add(groupHash[attr][contact[attr]])
});
groups = Array.from(groups);
groups.push(group);
groupNodes.push(group);
for(var i = 1; i < groups.length; i++) {
UNIONFIND.combine(groups[0], groups[i]);
}
["name", "phone", "email"].forEach(function(attr){
groupHash[attr][contact[attr]] = groups[0];
});
})
var contactsInGroup = {}
groupNodes.forEach(function(group){
var groupId = UNIONFIND.find(group).id;
if (contactsInGroup.hasOwnProperty(groupId) == false) {
contactsInGroup[groupId] = [];
}
contactsInGroup[groupId].push(group.id);
})
var result = Object.values(contactsInGroup).filter(function(list){
return list.length > 1
})
console.log(result)