I was looking to test the performance of MDLD for some in-browser string comparisions to be integrated into a web-app. The use-case involves comparing strings like, "300mm, Packed Wall" and "Packed Wall - 300mm", so I was looking for fuzzy string matching, that has some tolerance for punctuation and typos, as well as allowing block character transpositions.
I wasn't able to find an implementation online for Javascript. I found a version written for PL/SQL available at CSIRO's Taxamatch Wiki.
This was my attempt at converting the code in to JS; the results for the basic function seem fairly accurate, however, the block transposition calculation doesn't give the expected results. E.g. "Hi There" vs "There Hi" returns "6", regardless of what the block limit is set to.
If anyone knows of a working implementation, could you point me to it? Alternatively, what's the problem with my adaptation, or the source code itself? The only major change I made was to use "Math.ceil()" in two instances where the source appeared to use integer division, which would always take the floor-- It was causing odd issues for inputs that would result in 1 character strings-- but didn't seem to affect the behaviour of other cases I'd tested.
function mdld(str1, str2, block_lim)
{
mycol = [];
cost = 0;
len1 = str1.length;
len2 = str2.length;
if( str1 === str2 )
return 0;
else if ( len1 === 0 || len2 === 0 )
return Math.max(len1, len2);
else if ( len1 === 1 && len2 === 1 && str1 !== str2 )
return 1;
else
{
// Temporary strings which will be pre-processed
// Speeds up calcs & retains correct measurement
temp1 = str1;
temp2 = str2;
// Trim any common initial characters
while ( temp1.substr(0,1) === temp2.substr(0,1) )
{
temp1 = temp1.substr(1, temp1.length);
temp2 = temp2.substr(1, temp2.length);
}
// Trim any trailing characters
while ( temp1.substr(-1,1) === temp2.substr(-1,1) )
{
temp1 = temp1.substr(0,temp1.length-1);
temp2 = temp2.substr(0,temp2.length-1);
}
len1 = temp1.length;
len2 = temp2.length;
// Calc Levenshtein Distance
if (len1 === 0 || len2 === 0)
return Math.max(len1, len2);
else if (len1 === 1 && len2 === 1 && str1 !== str2)
return 1;
else
{
// Create columns
var s, t;
for(s = 0; s <= len1; s++)
mycol[s] = [];
// Enter values into leftmost column
for(t = 0; t <= len2; t++)
mycol[0][t] = t;
// Populate remaining columns
for(s = 1; s <= len1; s++)
{
mycol[s][0] = s;
// Populate first row (each cell of one column)
for(t = 1; t <= len2; t++)
{
//Calculate cost
if (temp1.substr(s-1,1) === temp2.substr(t-1,1))
cost = 0;
else
cost = 1;
// extension to cover multiple character transpositions
// that includes calculation of original Levenshtein distance when no transposition
tempBlockLen = Math.min( Math.ceil(len1/2), Math.ceil(len2/2), !block_lim ? 1 : block_lim );
while (tempBlockLen >= 1)
{
if ((s >= tempBlockLen * 2) &&
(t >= tempBlockLen * 2) &&
(temp1.substr(s-tempBlockLen*2, tempBlockLen) === temp2.substr(t-tempBlockLen, tempBlockLen)) &&
(temp1.substr(s-tempBlockLen, tempBlockLen) === temp2.substr(t-tempBlockLen*2, tempBlockLen)))
{
// Transposition found
mycol[s][t] = Math.min( mycol[s][t-1] + 1,
mycol[s-1][t] + 1,
mycol[s-tempBlockLen*2][t-tempBlockLen*2] + cost + tempBlockLen - 1 );
tempBlockLen = 0;
}
else if (tempBlockLen === 1)
{
// No Transposition
mycol[s][t] = Math.min( mycol[s][t-1] + 1,
mycol[s-1][t] + 1,
mycol[s-1][t-1] + cost );
}
tempBlockLen = tempBlockLen - 1;
}
}
}
}
return mycol[len1][len2];
}
}
In the end, I couldn't figure out what the issue was with my adaptation of the code from CSIRO. Found a github repo that implemented the function in C with Ruby extensions, https://github.com/GlobalNamesArchitecture/damerau-levenshtein.
Adapted that to get a functional implementation. Seems to work fine, but not great for my use case. MDLD can swap blocks of text, but only in circumstances where multiple consecutive swaps aren't needed to construct the source string. Going to look at N-Grams instead.
For those who are interested, this was my final result. Performance-wise, with a block limit of 5, it compared about 1000, 20-40 character strings in about 5 seconds.
function method_internal_distance(s, t, block_size, max_distance)
{
s = s.toLocaleLowerCase();
t = t.toLocaleLowerCase();
var pure_levenshtein = false;
var stop_execution = false;
var sl = s.length;
var tl = t.length;
var i, i1, j, j1, k, half_sl, half_tl, cost;
var distance, del, ins, subs, transp, block;
var current_distance = 0;
var min = 0;
if (block_size == 0)
{
pure_levenshtein = true;
block_size = 1;
}
if (sl == 0)
return tl;
if (tl == 0)
return sl;
// case of lengths 1 must present or it will break further in the code
if( sl == 1 && tl ==1 && s != t)
return 1;
// Increment length value of both strings, to include 0 for matrix
sl++;
tl++;
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
d = [];
//populate 'vertical' row starting from the 2nd position (first one is filled already)
for(i=0; i < tl; i++)
{
d[i*sl] = i;
}
//fill up array with scores
for(i = 1; i<sl; i++)
{
d[i] = i;
if (stop_execution)
break;
current_distance = 10000;
for(j=1; j<tl; j++)
{
cost = 1;
if(s[i-1] == t[j-1])
cost = 0;
half_sl = Math.floor((sl-1)/2);
half_tl = Math.floor((tl-1)/2);
block = block_size < half_sl || half_sl == 0 ? block_size : half_sl;
block = block < half_tl || half_tl == 0 ? block : half_tl;
while (block >=1)
{
var swap1 = 1;
var swap2 = 1;
i1 = i - (block * 2);
j1 = j - (block * 2);
for (k = i1; k < (i1 + block); k++)
{
if(s[k] != t[k+block])
{
swap = 0;
break;
}
}
for (k = j1; k < (j1 + block); k++)
{
if (t[k] != s[k+block])
{
swap2 = 0;
break;
}
}
del = d[j*sl + i - 1] + 1;
ins = d[(j-1)*sl + i] + 1;
min = del;
if (ins < min)
min = ins;
if (!pure_levenshtein && i >= 2*block && j >= 2*block && swap1 == 1 && swap2 == 1)
{
transp = d[(j-block*2)*sl+i-block*2] + cost + block - 1;
if (transp < min)
min = transp;
block = 0;
}
else if (block == 1)
{
subs = d[(j-1)*sl + i - 1] + cost;
if (subs < min)
min = subs;
}
block--;
}
d[j*sl+i] = min;
if (current_distance > d[j*sl+i])
current_distance = d[j*sl+i];
}
if (current_distance > max_distance)
stop_execution = true;
}
distance = d[sl*tl-1];
if (stop_execution)
distance = current_distance;
return distance;
}