I have an array of objects containing approximately 58,000 unique strings (partName).
parts = [
{ _id: 59a942a8c0b7467bf08711df, partName: '0' },
{ _id: 59a94299c0b7467bf084a917, partName: '9129' },
{ _id: 59a94299c0b7467bf084a918, partName: '9130' },
..,
.. ]
Here is the code. I'm trying to create clusters of similar strings.
The code does this, but a partName that is already connected with one cluster can appear again in another cluster. I want to avoid this: each unique part name should be connected to only ONE cluster.
Here is an example I get:
{
"9129": [
"9132",
"9190",
"9279"
]
},
{
"9130": [
"9132",
"9180",
"9190",
"9430"
]
}
As you can see, the strings 9132 and 9190 are repeated in the following cluster. So my question is: how do I remove the strings 9132, 9190 and 9279 from further consideration after they have been connected with cluster 9129?
/**
 * Builds clusters of similar part names.
 *
 * Each of the first `seedLimit` parts is used as a cluster seed. Every part
 * whose name scores above `threshold` against the seed (as reported by
 * `fuzzball.token_sort_ratio`) is added to that seed's cluster.
 *
 * NOTE: this version does NOT track names across clusters, so the same name
 * may appear in several clusters.
 *
 * Relies on a `fuzzball` object being in scope (not defined in this snippet).
 *
 * @param {Array<{partName: string}>} arrayOfParts - Parts to cluster.
 * @param {object} [options]
 * @param {number} [options.seedLimit=5] - Maximum number of leading parts used as seeds.
 * @param {number} [options.threshold=70] - Score a candidate must exceed to join a cluster.
 * @returns {Array<Object<string, string[]>>} One `{ seedName: sortedMembers }` object per seed.
 */
function createCluster(arrayOfParts, { seedLimit = 5, threshold = 70 } = {}) {
  // Never read past the end of the input when fewer than `seedLimit` parts exist.
  const seedCount = Math.min(seedLimit, arrayOfParts.length);
  const clusterArray = [];

  for (let i = 0; i < seedCount; i++) {
    const seedName = arrayOfParts[i].partName;
    const seedKey = seedName.toLowerCase();
    console.log(arrayOfParts[i]);

    // A Set keeps the members unique without a linear `includes` scan.
    const members = new Set();

    // Scan every part, including index 0; the seed itself is excluded by the
    // case-insensitive name check below.
    for (const { partName } of arrayOfParts) {
      if (partName.toLowerCase() === seedKey) {
        continue;
      }
      if (fuzzball.token_sort_ratio(seedName, partName) > threshold) {
        members.add(partName);
      }
    }

    clusterArray.push({ [seedName]: [...members].sort() });
  }

  console.log("clusterArray", JSON.stringify(clusterArray, null, 2));
  console.log("clusterArray.length", clusterArray.length);
  return clusterArray;
}
You could define a bank of used strings outside the scope of the loop, add each string to it once it has been placed in a cluster, and check the bank before adding a string to another cluster.
Alternatively, if you have a finite (and potentially easily iterable) number of part names, you could hold them as keys with a true/false value to represent 'usable' that you could switch when used.
Solution (using a "bank of used strings" outside the scope of the loop):
/**
 * Builds clusters of similar part names so that each name belongs to at most
 * one cluster.
 *
 * Each of the first `seedLimit` parts is considered as a cluster seed. A part
 * that has already been placed in an earlier cluster is skipped as a seed.
 * Every not-yet-used part whose name scores above `threshold` against the
 * seed (as reported by `fuzzball.token_sort_ratio`) joins that seed's cluster
 * and is marked as used.
 *
 * Relies on a `fuzzball` object being in scope (not defined in this snippet).
 *
 * @param {Array<{partName: string}>} arrayOfParts - Parts to cluster.
 * @param {object} [options]
 * @param {number} [options.seedLimit=5] - Maximum number of leading parts considered as seeds.
 * @param {number} [options.threshold=70] - Score a candidate must exceed to join a cluster.
 * @returns {Array<Object<string, string[]>>} One `{ seedName: sortedMembers }` object per cluster.
 */
function createCluster(arrayOfParts, { seedLimit = 5, threshold = 70 } = {}) {
  // Never read past the end of the input when fewer than `seedLimit` parts exist.
  const seedCount = Math.min(seedLimit, arrayOfParts.length);

  // Names already tied to a cluster; a Set gives O(1) membership checks,
  // which matters with tens of thousands of names.
  const usedNames = new Set();
  const clusterArray = [];

  for (let i = 0; i < seedCount; i++) {
    const seedName = arrayOfParts[i].partName;
    if (usedNames.has(seedName)) {
      continue;
    }
    // Record the seed too, so it can never be pulled into a later cluster.
    usedNames.add(seedName);
    console.log(arrayOfParts[i]);

    const seedKey = seedName.toLowerCase();
    const members = [];

    for (const { partName } of arrayOfParts) {
      // Cheap exclusions first; the fuzzy ratio is the expensive call.
      if (usedNames.has(partName) || partName.toLowerCase() === seedKey) {
        continue;
      }
      if (fuzzball.token_sort_ratio(seedName, partName) > threshold) {
        members.push(partName);
        usedNames.add(partName);
      }
    }

    clusterArray.push({ [seedName]: members.sort() });
  }

  console.log("clusterArray", JSON.stringify(clusterArray, null, 2));
  console.log("clusterArray.length", clusterArray.length);
  return clusterArray;
}