I'm trying to write a script that produces the hash value for all filenames and directory names in a specified root directory. This is my script so far:
import hashlib
import os
import sys
class Hasher:
    """Hash file contents or directory listings through one shared MD5 object.

    A single ``hashlib.md5`` object is created per instance and every method
    feeds its data into that same object. Each returned digest therefore
    covers everything the instance has been given so far, not only the data
    of the most recent call.
    """

    def __init__(self):
        # One running MD5 state that lives as long as this instance does.
        self.hash_func = hashlib.md5()

    def hash_file(self, file_path):
        """Feed the bytes of *file_path* into the running hash.

        Returns the raw digest of all data consumed by this instance so far.
        """
        with open(file_path, "rb") as stream:
            contents = stream.read()
        self.hash_func.update(contents)
        return self.hash_func.digest()

    def hash_dir(self, dir_path):
        """Feed the directory paths and file names under *dir_path* into the running hash.

        Only names are hashed (UTF-8 encoded), never file contents. Returns
        the raw digest of all data consumed by this instance so far.
        """
        md5 = self.hash_func  # same object on every call, so state carries over
        for current_dir, _subdirs, names in os.walk(dir_path):
            md5.update(current_dir.encode("utf-8"))
            for name in names:
                md5.update(name.encode("utf-8"))
        return md5.digest()
# Demo: hash the same directory twice through ONE Hasher instance and print
# both results (each is the str() form of the raw digest bytes).
root_dir = "D:/folder/"
hasher = Hasher()
hash_1 = str(hasher.hash_dir(root_dir))
hash_2 = str(hasher.hash_dir(root_dir))
print(hash_1, hash_2, sep="\n")
For some reason, it produces two different hash values for the same directory, without any change in the directory whatsoever. How can I make it so that the same hash value is produced if the directory stays the same?
The problem is that the same `hashlib.md5` object is reused for every call, so each call returns a hash of all the data fed in so far (the cumulative data), not only of the data from the latest, intended call.
You could solve this by creating a new `Hasher` object every time (i.e. calling `Hasher().hash_dir(root_dir)` twice in this case). But since your `Hasher` class does not contain any data other than the `md5` object, and its two methods could be static, I would recommend making both methods static and creating the `hashlib.md5` object inside the methods themselves:
import hashlib
import os
class Hasher:
    """Stateless MD5 helpers for file contents and directory listings.

    Every call builds its own ``hashlib.md5`` object, so a digest never
    includes data left over from an earlier call.
    """

    @staticmethod  # make it a static method
    def hash_file(file_path: str) -> bytes:  # no 'self' as first argument
        """Return the raw MD5 digest of the contents of *file_path*.

        The file is read in fixed-size chunks so large files are not loaded
        into memory at once; the digest equals that of hashing the whole
        content in a single update.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        hash_func = hashlib.md5()  # create the hashlib.md5 object here
        with open(file_path, "rb") as file:
            # iter() with a sentinel stops at the empty read that marks EOF.
            for chunk in iter(lambda: file.read(65536), b""):
                hash_func.update(chunk)
        return hash_func.digest()

    @staticmethod  # make it a static method
    def hash_dir(dir_path: str) -> bytes:  # no 'self' as first argument
        """Return the raw MD5 digest of the names found under *dir_path*.

        For every directory visited by ``os.walk`` the directory path (as
        yielded by the walk, so it starts with *dir_path*) is hashed, followed
        by the names of the files directly inside it. Names are UTF-8 encoded
        and concatenated without a separator. File contents are not hashed.

        Entries are processed in sorted order so that an unchanged tree
        always yields the same digest.
        """
        hash_func = hashlib.md5()  # create the hashlib.md5 object here
        for dirpath, dirnames, filenames in os.walk(dir_path):
            # The OS returns directory entries in arbitrary order. Sorting
            # dirnames in place makes the (top-down) walk descend in a fixed
            # order; sorting filenames fixes the order within a directory.
            dirnames.sort()
            hash_func.update(dirpath.encode("utf-8"))
            for file_name in sorted(filenames):
                hash_func.update(file_name.encode("utf-8"))
        return hash_func.digest()
def main(root_dir="D:/folder/"):
    """Hash *root_dir* twice and print both digests as hexadecimal text.

    The two printed lines are expected to match when the directory is
    unchanged between the calls.

    Args:
        root_dir: Directory whose listing is hashed; defaults to the
            example path used in this script.
    """
    # bytes.hex() gives readable hex text; str() on the digest would print
    # the b'...' repr and raise BytesWarning when Python runs with -b.
    hash_1 = Hasher.hash_dir(root_dir).hex()
    hash_2 = Hasher.hash_dir(root_dir).hex()
    print(hash_1)
    print(hash_2)


if __name__ == "__main__":
    main()