I'm trying to write a function (in Python) how many different K-grams a string has. How can I compute it using a suffix array and a LCP array (I already have them computed)?
Having LCP array lc
, we should walk through it and count values less than k
, whilst corresponding suffix have enough length (at least k
, this checking requires suffix array sa
value):
from collections import defaultdict
def sort_bucket(s, bucket, order):
d = defaultdict(list)
for i in bucket:
key = s[i + order // 2:i + order]
d[key].append(i)
result = []
for k, v in sorted(d.items()):
if len(v) > 1:
result += sort_bucket(s, v, 2 * order)
else:
result.append(v[0])
return result
def suffix_array_ManberMyers(s):
return sort_bucket(s, range(len(s)), 1)
def lcp_kasai(s, suffarr):
n = len(suffarr)
k = 0
lcp = [0] * n
rank = [0] * n
for i in range(n):
rank[suffarr[i]] = i
for i in range(n):
if (k>0):
k -= 1
if(rank[i]==n-1):
k = 0
continue
j = sa[rank[i]+1]
while((i+k<n) & (j+k<n) & (s[i+k]==s[j+k])):
k += 1
lcp[rank[i]] = k
return lcp
def dumb(s, k):
kg = set()
for i in range(len(s)-k+1):
kg.add(s[i:i+k])
#print(sorted(kg))
return len(kg)
s = "ACGTTGCATGTCGCATGATGCATGAGAGCT$"
sa = suffix_array_ManberMyers(s)
print(sa)
lc = lcp_kasai(s, sa)
print(lc)
k = 3
distinct = 0
for i in range(1, len(lc)):
if (lc[i] < k) and (sa[i] + k < len(s)):
distinct += 1
print(dumb(s[:-1], k), distinct)
Output:
[30, 0, 24, 26, 21, 14, 17, 7, 20, 13, 6, 11, 1, 28, 23, 25, 16, 19, 12, 5, 27, 9, 2, 29, 10, 22, 15, 18, 4, 8, 3]
[0, 1, 2, 1, 4, 3, 3, 0, 5, 4, 1, 2, 1, 0, 3, 2, 1, 6, 5, 2, 1, 2, 0, 1, 1, 3, 2, 6, 2, 1, 0]
18 18
dumb
function is for testing