Search code examples
c++levenshtein-distance

Revise existing Levenshtein distance code to accommodate different operation costs


I have found a lot of sources that determine Levenshtein distances (LD) between two strings. However all of them assume the costs for substitution, insertion, and deletion operations are all set to 1.

This source for C++ is very efficient, which I am trying to adapt below to allow different costs per operation.

#include <vector>
#include <string>
#include <iostream>

size_t levenshtein_distance(const std::string &a, const std::string &b);

int main()
{
    std::string a, b;

    a = "roger"; b = "Roger";
    std::cout << levenshtein_distance(a, b) << std::endl;

    a = "roger"; b = "oger";
    std::cout << levenshtein_distance(a, b) << std::endl;

    a = "oger"; b = "roger";
    std::cout << levenshtein_distance(a, b) << std::endl;

    return 0;
}

size_t levenshtein_distance(const std::string &a, const std::string &b)
{
    // Costs of operations
    const size_t substitution = 5;
    const size_t deletion = 2;
    const size_t insertion = 2;

    size_t a_size = a.size(), b_size = b.size();

    std::vector<size_t> P(b_size + 1), Q(b_size + 1);

    for (size_t i = 0; i < Q.size(); i++)
        Q[i] = i;

    for (size_t i = 0; i < a_size; i++)
    {
        P[0] = i + 1;

        for (size_t j = 0; j < b_size; j++)
            P[j + 1] = std::min(
                std::min(Q[j + 1] + 1, P[j] + 1), Q[j] + ((a[i] == b[j])? 0: substitution));

        P.swap(Q);
    }

    return Q[b_size];
}

I think I have substitution in the correct place. If I change it to 5, it gives a correspondingly large LD for that operation, but can't seem to find where to apply insertion or deletion. I tried changing the literals 1, but they don't seem to have any effect - the result is always 1 for the insertion or deletion operations.


Solution

  • You could adapt the algorithm from wikipedia as following:

    size_t levenshtein_distance(const std::string &s1, const std::string &s2)
    {
        const size_t substitution = 5;
        const size_t deletion = 2;
        const size_t insertion = 3;
    
        const size_t len1 = s1.size(), len2 = s2.size();
        vector<vector<unsigned int> > d(len1 + 1, vector<unsigned int>(len2 + 1));
        d[0][0] = 0;
        for(unsigned int i = 1; i <= len1; ++i) d[i][0] = deletion * i;
        for(unsigned int i = 1; i <= len2; ++i) d[0][i] = insertion * i;
    
        for(unsigned int i = 1; i <= len1; ++i)
            for(unsigned int j = 1; j <= len2; ++j)
                d[i][j] = std::min(
                    std::min(d[i - 1][j] + deletion, d[i][j - 1] + insertion),
                    d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : substitution)
                );
        return d[len1][len2];
    }
    

    For example d[i][0] means the cost of transforming the first i characters of first string to the zero first characters of the second string. This comes obviously from deletions and so d[i][0] = deletion * i. Similarly, d[0][i] = insertion * i.

    If you don't want to use the two dimensional array, then you can only keep the last row:

    int levenshtein_distance(const std::string &s1, const std::string &s2)
    {
        const int substitution = 5;
        const int deletion = 2;
        const int insertion = 3;
    
        const int len1 = s1.size(), len2 = s2.size();
        std::vector<int> P(len2 + 1), Q(len2 + 1);
        for(int j = 1; j <= len2; ++j) P[j] = insertion * j;
        for(int i = 1; i <= len1; ++i) {
            Q[0] = deletion * i;
            for(int j = 1; j <= len2; ++j) {
                Q[j] = std::min(Q[j - 1] + insertion, P[j] + deletion);
                Q[j] = std::min(Q[j], P[j - 1] +
                    (s1[i - 1] == s2[j - 1] ? 0 : substitution)
                );
            }
            std::swap(P, Q);
        }
        return P[len2];
    }