Search code examples
pythonstringalgorithmdata-structuressuffix-array

How to find different k-grams of a string using a suffix array?


I'm trying to write a function (in Python) how many different K-grams a string has. How can I compute it using a suffix array and a LCP array (I already have them computed)?


Solution

  • Having LCP array lc, we should walk through it and count values less than k, whilst corresponding suffix have enough length (at least k, this checking requires suffix array sa value):

    from collections import defaultdict
    
    def sort_bucket(s, bucket, order):
        d = defaultdict(list)
        for i in bucket:
            key = s[i + order // 2:i + order]
            d[key].append(i)
        result = []
        for k, v in sorted(d.items()):
            if len(v) > 1:
                result += sort_bucket(s, v, 2 * order)
            else:
                result.append(v[0])
        return result
    
    
    def suffix_array_ManberMyers(s):
        return sort_bucket(s, range(len(s)), 1)
    
    def lcp_kasai(s, suffarr):
        n = len(suffarr)
        k = 0
        lcp = [0] * n
        rank = [0] * n
        for i in range(n):
            rank[suffarr[i]] = i
    
        for  i in range(n):
            if (k>0):
                k -= 1
            if(rank[i]==n-1):
                 k = 0
                 continue
            j = sa[rank[i]+1]
            while((i+k<n) & (j+k<n) & (s[i+k]==s[j+k])):
                k += 1
            lcp[rank[i]] = k
        return lcp
    
    def dumb(s, k):
        kg = set()
        for i in range(len(s)-k+1):
            kg.add(s[i:i+k])
        #print(sorted(kg))
        return len(kg)
    
    s = "ACGTTGCATGTCGCATGATGCATGAGAGCT$"
    sa = suffix_array_ManberMyers(s)
    print(sa)
    lc = lcp_kasai(s, sa)
    print(lc)
    k = 3
    distinct = 0
    for i in range(1, len(lc)):
        if (lc[i] < k) and (sa[i] + k < len(s)):
            distinct += 1
    print(dumb(s[:-1], k), distinct)
    

    Output:

    [30, 0, 24, 26, 21, 14, 17, 7, 20, 13, 6, 11, 1, 28, 23, 25, 16, 19, 12, 5, 27, 9, 2, 29, 10, 22, 15, 18, 4, 8, 3]
    [0, 1, 2, 1, 4, 3, 3, 0, 5, 4, 1, 2, 1, 0, 3, 2, 1, 6, 5, 2, 1, 2, 0, 1, 1, 3, 2, 6, 2, 1, 0]
    18 18
    

    dumb function is for testing