Search code examples
pythonpython-2.7filenameschecksumfilesize

Comparing files, path to check filenames, filesize, and checksum md5 in python


I would like to compare files in a directory that has its path, filename, filesize, and md5 checksum. I got three functions that works fine for filename, filesize and md5 checksum when I checked them independently. I think the problem is how I set up another function to process a dictionary with items of a csv files. Here is the csv files to compare with.

|Path|Filename|File Size|Hash
|/var/tmp/test|test1.txt|257|2e6041635f72233f4cdf6fbfb0a8288e
|/var/tmp/test|text2.txt|68|d3428d5910f54270d62ff57ccd5ff52c
|/var/tmp/test|text3.txt|58|42e8b3cba5320e07745110b8b193f534
|/var/tmp/test|text4.xml|128|4acc96e6e8b9006722408e15e555d2c2
|/var/tmp/test|text5.csv|214|a7071c13195d8485b2fb4a68503cbd7a

I have tried to modify the md5, filename, filesize and how its looping through the directory but seems to have issues.

def csv_checksum(files, path):
    # Get column with delimiter
    csv.register_dialect('myDialect', delimiter = '|')

    csvDics = {}
    # Open file, read them, and output csv formatted
    with open(files, 'r') as f:
        reader = csv.reader(f, dialect='myDialect')
        for row in reader:
            if reader.line_num == 1:
                continue
            csvDic = {
                    'Directory': row[1],
                    'Filename': row[2],
                    'File Size': row[3],
                    'Hash': row[4]
            }
            csvDics.update(csvDic) 
            print(csvDics)
            comp_original(csvDics, path)

def comp_original(dic, path):
    for (dirpath, dirnames, filenames) in os.walk(path):
        for files in filenames:
            if (dic.get('Directory') == path
                    and dic.get('Filename') == get_filename(files)
                    and dic.get('File Size') == get_filesize(files)
                    and dic.get('Hash') == get_md5(files)):
                print("All files matches")
                return True

def get_filename(fname):
    filename = os.path.basename(fname)
    return filename

def get_filesize(fname):
    stat_info = os.stat(fname)
    file_size = stat_info.st_size
    return file_size

def get_md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(2 ** 20), b""):
            hash_md5.update(chunk)
            get_hash = hash_md5.hexdigest()
            return get_hash

For filename it goes through the loop but prints out 3 are not matching as No matches and one as All files matches which all should matched. Then for both filesize and get_md5, I get OSError: [Errno 2] No such file or directory: 'text3.txt'

For filename issue:

{'Directory': '/var/tmp/test', 'File Size': '257', 'Hash': '2e6041635f72233f4cdf6fbfb0a8288e', 'Filename': 'test1.txt'}
{'Directory': '/var/tmp/test', 'File Size': '68', 'Hash': 'd3428d5910f54270d62ff57ccd5ff52c', 'Filename': 'text2.txt'}
{'Directory': '/var/tmp/test', 'File Size': '58', 'Hash': '42e8b3cba5320e07745110b8b193f534', 'Filename': 'text3.txt'}
{'Directory': '/var/tmp/test', 'File Size': '128', 'Hash': '4acc96e6e8b9006722408e15e555d2c2', 'Filename': 'text4.xml'}
{'Directory': '/var/tmp/test', 'File Size': '214', 'Hash': 'a7071c13195d8485b2fb4a68503cbd7a', 'Filename': 'text5.csv'}
No matches
No matches
No matches
All files matches

For file size:

  File "./create_manifest.py", line 44, in csv_checksum
    comp_baseline_manifest(csvDics, path)
  File "./create_manifest.py", line 88, in comp_baseline_manifest
    and dic.get('File Size') == get_filesize(files)):
  File "./create_manifest.py", line 100, in get_filesize
    stat_info = os.stat(fname)
OSError: [Errno 2] No such file or directory: 'text3.txt'

For md5 error:

    comp_baseline_manifest(csvDics, path)
  File "./create_manifest.py", line 89, in comp_baseline_manifest
    and dic.get('Hash') == get_md5(files)):
  File "./create_manifest.py", line 107, in get_md5
    with open(fname, "rb") as f:
IOError: [Errno 2] No such file or directory: 'text3.txt'

Solution

  • Instead of this:

    for (dirpath, dirnames, filenames) in os.walk(path):
        for files in filenames:
            if (dic.get('Directory') == path
                    and dic.get('Filename') == get_filename(files)
                    and dic.get('File Size') == get_filesize(files)
                    and dic.get('Hash') == get_md5(files)):
    

    you should have used :

    for root, dirs, files in os.walk(path):
        for f in files:
            file_name = os.path.join( root, f )   # <<--- this is important
            if (dic.get('Directory') == path      # `root` here, not `path` ??
                    and dic.get('Filename') == get_filename(file_name)
                    and dic.get('File Size') == get_filesize(file_name)
                    and dic.get('Hash') == get_md5(file_name)):