I am trying to solve leetcode problem 187. Repeated DNA Sequences using Rabin Karp algorithm with rolling hash approach. At first, I solved the problem without using any MOD operations like below.
class Solution:
def calculate_hash(self, prime, text):
hash_value = 0
for i in range(len(text)):
hash_value = hash_value + (ord(text[i]) * pow(prime, i))
return hash_value
def recalculate_hash(self, prime, old_hash, text, index, L):
new_hash = old_hash - ord(text[index - 1])
new_hash /= prime
new_hash = new_hash + (ord(text[index + L - 1]) * pow(prime, L - 1))
return new_hash
def findRepeatedDnaSequences(self, s: str) -> List[str]:
L, s_len = 10, len(s)
if s_len <= L:
return []
prime = 7
seen, res = set(), set()
old_hash = self.calculate_hash(prime, s[0:L])
seen.add(old_hash)
for i in range(1, s_len - L + 1):
new_hash = self.recalculate_hash(prime, old_hash, s, i, L)
if new_hash in seen:
res.add(s[i:i+L])
seen.add(new_hash)
old_hash = new_hash
return list(res)
In the above approach, I used the little endian approach and I had to use division during the calculation of the rolling hash value. However, I came across to this answer in StackOverlow, where the big endian approach is suggested like below.
I tried to apply the approach but not getting desired answer. This is my tried code.
class Solution:
def calculate_hash(self, prime, text, L, MOD):
hash_value = 0
for i in range(len(text)):
p_power = pow(prime, L - i - 1, MOD)
hash_value = (hash_value + (ord(text[i]) * p_power)) % MOD
return hash_value
def recalculate_hash(self, prime, old_hash, text, index, L, MOD):
p_power = pow(prime, L - 1, MOD)
new_hash = (old_hash * prime)
new_hash = (new_hash - (ord(text[index - 1]) * p_power) + ord(text[index + L - 1])) % MOD
return new_hash
def findRepeatedDnaSequences(self, s: str) -> List[str]:
L, s_len = 10, len(s)
if s_len <= L:
return []
prime = 7
MOD = 2**31 - 1
seen, res = set(), set()
old_hash = self.calculate_hash(prime, s[0:L], L, MOD)
seen.add(old_hash)
for i in range(1, s_len - L + 1):
new_hash = self.recalculate_hash(prime, old_hash, s, i, L, MOD)
if new_hash in seen:
res.add(s[i:i+L])
seen.add(new_hash)
old_hash = new_hash
return list(res)
What am I missing here during the MOD calculation?
def recalculate_hash(self, prime, old_hash, text, index, L, MOD):
p_power = pow(prime, L - 1, MOD)
new_hash = (old_hash * prime)
new_hash = (new_hash - (ord(text[index - 1]) * p_power) + ord(text[index + L - 1])) % MOD
return new_hash
The power that ord(text[index - 1])
has when you need to subtract it is actually L
, since the line before has already "moved it up" by one. pow(prime, L - 1, MOD)
should therefore be pow(prime, L, MOD)
- or alternatively, you can keep pow(prime, L - 1, MOD)
but subtract ord(text[index - 1]) * p_power
before multiplying by the prime
.
By the way in calculate_hash
, you don't need to calculate the power from scratch every time, you can iteratively multiply by prime
in every iteration the same as in recalculate_hash
.