Given two strings u
and v
we can compute the edit distance using the popular Levenshtein algorithm. Using a method introduced in [1] by Sim et al. I was able to compute k
-approximate periods of strings in Python with the following code
def wagnerFischerTable(a, b):
D = [[0]]
[D.append([i]) for i, s in enumerate(a, 1)]
[D[0].append(j) for j, t in enumerate(b, 1)]
for j, s in enumerate(b, 1):
for i, t in enumerate(a, 1):
if s == t:
D[i].append(D[i-1][j-1])
else:
D[i].append(
min(
D[i-1][j] + 1,
D[i][j-1] + 1,
D[i-1][j-1] +1
)
)
return D
def simEtAlTables(s, p):
D = []
for i in xrange(len(s)):
D.append(wagnerFischerTable(p, s[i:]))
return D
def approx(s, p):
D = simEtAlTables(s, p)
t = [0]
for i in xrange(1, len(s)+1):
cmin = 9000
for h in xrange(0, i):
cmin = min(
cmin,
max(t[h], D[h][-1][i-h])
)
t.append(cmin)
return t[len(s)]
I wanted to port this to F# however I wasn't successful yet and I am looking forward to get some feedback what might be wrong.
let inline min3 x y z =
min (min x y) z
let wagnerFischerTable (u: string) (v: string) =
let m = u.Length
let n = v.Length
let d = Array2D.create (m + 1) (n + 1) 0
for i = 0 to m do d.[i, 0] <- i
for j = 0 to n do d.[0, j] <- j
for j = 1 to n do
for i = 1 to m do
if u.[i-1] = v.[j-1] then
d.[i, j] <- d.[i-1, j-1]
else
d.[i, j] <-
min3
(d.[i-1, j ] + 1) // a deletion
(d.[i , j-1] + 1) // an insertion
(d.[i-1, j-1] + 1) // a substitution
d
let simEtAlTables (u: string) (v: string) =
let rec tabulate n lst =
if n <> u.Length then
tabulate (n+1) (lst @ [wagnerFischerTable (u.Substring(n)) v])
else
lst
tabulate 0 []
let approx (u: string) (v: string) =
let tables = simEtAlTables u v
let rec kApprox i (ks: int list) =
if i = u.Length + 1 then
ks
else
let mutable curMin = 9000
for h = 0 to i-1 do
curMin <- min curMin (max (ks.Item h) ((tables.Item h).[i-h, v.Length - 1]))
kApprox (i+1) (ks @ [curMin])
List.head (List.rev (kApprox 1 [0]))
The reason why it "doesn't work" is just that I am getting wrong values. The Python code passes all test cases while the F# code fails every test. I presume that I have errors in the functions simEtAlTables
and/or approx
. Probably something with the indices, especially accessing the three dimensional list of table in approx
.
So here are three test cases which should cover different results:
Test 1: approx "abcdabcabb" "abc" -> 1
Test 2: approx "abababababab" "ab" -> 0
Test 3: approx "abcdefghijklmn" "xyz" -> 3
[1] http://www.lirmm.fr/~rivals/ALGOSEQ/DOC/SimApprPeriodsTCS262.pdf
This isn't functional in the least (neither is your Python solution), but here's a more direct translation to F#. Maybe you can use it as a starting point and make it more functional from there (although I'll hazard a guess it won't improve performance).
let wagnerFischerTable (a: string) (b: string) =
let d = ResizeArray([ResizeArray([0])])
for i = 1 to a.Length do d.Add(ResizeArray([i]))
for j = 1 to b.Length do d.[0].Add(j)
for j = 1 to b.Length do
for i = 1 to a.Length do
let s, t = b.[j-1], a.[i-1]
if s = t then
d.[i].Add(d.[i-1].[j-1])
else
d.[i].Add(
Seq.min [
d.[i-1].[j] + 1
d.[i].[j-1] + 1
d.[i-1].[j-1] + 1
])
d
let simEtAlTables (s: string) (p: string) =
let d = ResizeArray()
for i = 0 to s.Length - 1 do
d.Add(wagnerFischerTable p s.[i..])
d
let approx (s: string) (p: string) =
let d = simEtAlTables s p
let t = ResizeArray([0])
for i = 1 to s.Length do
let mutable cmin = 9000
for h = 0 to i - 1 do
let dh = d.[h]
cmin <- min cmin (max t.[h] dh.[dh.Count-1].[i-h])
t.Add(cmin)
t.[s.Length]