Search code examples
pythonstringf#levenshtein-distance

Approximate periods of strings - port Python code to F#


Given two strings u and v we can compute the edit distance using the popular Levenshtein algorithm. Using a method introduced in [1] by Sim et al. I was able to compute k-approximate periods of strings in Python with the following code

def wagnerFischerTable(a, b):
    D = [[0]]
    [D.append([i]) for i, s in enumerate(a, 1)]
    [D[0].append(j) for j, t in enumerate(b, 1)]

    for j, s in enumerate(b, 1):
        for i, t in enumerate(a, 1):
            if s == t:
                D[i].append(D[i-1][j-1])
            else:
                D[i].append(
                    min(
                        D[i-1][j] + 1,
                        D[i][j-1] + 1,
                        D[i-1][j-1] +1
                    )
                )
    return D

def simEtAlTables(s, p):
    D = []
    for i in xrange(len(s)):
        D.append(wagnerFischerTable(p, s[i:]))
    return D

def approx(s, p):
    D = simEtAlTables(s, p)
    t = [0]
    for i in xrange(1, len(s)+1):
        cmin = 9000
        for h in xrange(0, i):
            cmin = min(
                cmin,
                max(t[h], D[h][-1][i-h])
            )
        t.append(cmin)
    return t[len(s)]

I wanted to port this to F# however I wasn't successful yet and I am looking forward to get some feedback what might be wrong.

let inline min3 x y z = 
    min (min x y) z

let wagnerFischerTable (u: string) (v: string) =
    let m = u.Length
    let n = v.Length
    let d = Array2D.create (m + 1) (n + 1) 0

    for i = 0 to m do d.[i, 0] <- i
    for j = 0 to n do d.[0, j] <- j    

    for j = 1 to n do
        for i = 1 to m do
            if u.[i-1] = v.[j-1] then
                d.[i, j] <- d.[i-1, j-1]
            else
                d.[i, j] <-
                    min3
                        (d.[i-1, j  ] + 1) // a deletion
                        (d.[i  , j-1] + 1) // an insertion
                        (d.[i-1, j-1] + 1) // a substitution
    d

let simEtAlTables (u: string) (v: string) =
    let rec tabulate n lst =
        if n <> u.Length then
            tabulate (n+1) (lst @ [wagnerFischerTable (u.Substring(n)) v])
        else
            lst
    tabulate 0 []

let approx (u: string) (v: string) =
    let tables = simEtAlTables u v
    let rec kApprox i (ks: int list) =
        if i = u.Length + 1 then
            ks
        else
            let mutable curMin = 9000
            for h = 0 to i-1 do
                curMin <- min curMin (max (ks.Item h) ((tables.Item h).[i-h, v.Length - 1]))
            kApprox (i+1) (ks @ [curMin])
    List.head (List.rev (kApprox 1 [0]))

The reason why it "doesn't work" is just that I am getting wrong values. The Python code passes all test cases while the F# code fails every test. I presume that I have errors in the functions simEtAlTables and/or approx. Probably something with the indices, especially accessing the three dimensional list of table in approx.

So here are three test cases which should cover different results:

Test 1: approx "abcdabcabb" "abc" -> 1
Test 2: approx "abababababab" "ab" -> 0
Test 3: approx "abcdefghijklmn" "xyz" -> 3

[1] http://www.lirmm.fr/~rivals/ALGOSEQ/DOC/SimApprPeriodsTCS262.pdf


Solution

  • This isn't functional in the least (neither is your Python solution), but here's a more direct translation to F#. Maybe you can use it as a starting point and make it more functional from there (although I'll hazard a guess it won't improve performance).

    let wagnerFischerTable (a: string) (b: string) =
      let d = ResizeArray([ResizeArray([0])])
      for i = 1 to a.Length do d.Add(ResizeArray([i]))
      for j = 1 to b.Length do d.[0].Add(j)
      for j = 1 to b.Length do
        for i = 1 to a.Length do
          let s, t = b.[j-1], a.[i-1]
          if s = t then
            d.[i].Add(d.[i-1].[j-1])
          else
            d.[i].Add(
              Seq.min [
                d.[i-1].[j] + 1
                d.[i].[j-1] + 1
                d.[i-1].[j-1] + 1
              ])
      d
    
    let simEtAlTables (s: string) (p: string) =
      let d = ResizeArray()
      for i = 0 to s.Length - 1 do
        d.Add(wagnerFischerTable p s.[i..])
      d
    
    let approx (s: string) (p: string) =
      let d = simEtAlTables s p
      let t = ResizeArray([0])
      for i = 1 to s.Length do
        let mutable cmin = 9000
        for h = 0 to i - 1 do
          let dh = d.[h]
          cmin <- min cmin (max t.[h] dh.[dh.Count-1].[i-h])
        t.Add(cmin)
      t.[s.Length]