I'm a newbie in this field and am trying to learn a bit about how to write cryptographic hash functions.
To get some hands-on, I tried updating the PySHA2 algorithm for Python 3.6 and up (the original version doesn't work on Python 2.5+ and the author says he won't fix this). I don't intend to use this algorithm for any work, just coding this for the sake of knowledge.
I've reached this far:
import copy
import struct
_initial_hashes = [0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179]
_round_constants = [0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694,
0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70,
0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b,
0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30,
0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817]
def _rit_rot(on: int, by: int) -> int:
"""
helper function for right rotation as it isn't done by a simple bitwise operation (xor is done by '^')
:param on: value to be rotated
:param by: value by which to rotate
:return: right rotated 'on'
"""
return ((on >> by) | (on << (64 - by))) & 0xFFFFFFFFFFFFFFFF
def hash_main(chunk):
global _initial_hashes, _round_constants
# start the hashing process
# to begin, create a place to store the 80 words that we'll make
words = [0] * 80
# first 16 words will be saved without any changes
words[:16] = struct.unpack('!16Q', chunk)
# extend these 16 words into the remaining 64 words of 'message schedule array'
for i in range(16, 80):
part_1 = _rit_rot(words[i - 15], 1) ^ _rit_rot(words[i - 15], 8) ^ (words[i - 15] >> 7)
part_2 = _rit_rot(words[i - 2], 19) ^ _rit_rot(words[i - 2], 61) ^ (words[i - 2] >> 6)
words[i] = (words[i - 16] + part_1 + words[i - 7] + part_2) & 0xFFFFFFFFFFFFFFFF
# create the working variables
a, b, c, d, e, f, g, h = _initial_hashes
# start the compression function
for z in range(80):
var_1 = _rit_rot(a, 28) ^ _rit_rot(a, 34) ^ _rit_rot(a, 39)
var_2 = _rit_rot(e, 14) ^ _rit_rot(e, 18) ^ _rit_rot(e, 41)
var_3 = (a & b) ^ (a & c) ^ (b & c)
var_4 = (e & f) ^ ((~e) & g)
temp_1 = var_1 + var_3
temp_2 = h + var_2 + var_4 + _round_constants[z] + words[z]
# remix the hashes
h = g
g = f
f = e
e = (d + temp_2) & 0xFFFFFFFFFFFFFFFF
d = c
c = b
b = a
a = (temp_1 + temp_2) & 0xFFFFFFFFFFFFFFFF
# add this chunk to initial hashes
_initial_hashes = [(x + y) & 0xFFFFFFFFFFFFFFFF for x, y in zip(_initial_hashes,
[a, b, c, d, e, f, g, h])]
def _sha_backend_update(text_copy, _buffer, _counter):
"""
backend function that hashes given string
"""
global _initial_hashes, _round_constants
# create variables for cycling
_buffer += text_copy
_counter += len(text_copy)
# assert the variables are correct
if not text_copy:
return
if type(text_copy) is not str:
raise TypeError("Invalid Object! Please enter a valid string for hashing!")
# break the buffer into 128-bit chunks
while len(_buffer) >= 128:
chunk = _buffer[:128].encode()[1:]
hash_main(chunk)
_buffer = _buffer[128:]
def sha_backend_digest(text_to_hash: str, _buffer: str, _counter: int,
_output_size: int, hex_output: bool = False):
# initialize variables
variable_x = _counter & 0x7F
length = str(struct.pack('!Q', _counter << 3))
# set the thresholds
if variable_x < 112:
padding_len = 111 - variable_x
else:
padding_len = 239 - variable_x
# make a copy of the text_to_hash before starting hashing
text_copy = copy.deepcopy(text_to_hash)
m = '\x80' + ('\x00' * (padding_len + 8)) + length
# run the update function
_sha_backend_update(text_copy, _buffer, _counter)
# return the hash value
return_val = [hex(stuff) for stuff in _initial_hashes[:_output_size]]
if hex_output is True:
return_val = [int(stuff, base=16) for stuff in return_val]
return return_val
return ''.join(return_val)
def sha_512(text_to_hash: str, hex_digest: bool = False) -> str:
"""
frontend function for SHA512 hashing
:return: hashed string
"""
# before anything, check if the input is correct
if not text_to_hash:
return ""
if type(text_to_hash) is not str:
raise TypeError("Invalid content! Please provide content in correct format for hashing!")
# initialize default variables
_buffer = ''
_counter = 0
_output_size = 8
# start the backend function
return sha_backend_digest(text_to_hash, _buffer, _counter, _output_size, hex_output=hex_digest)
message = "This is a string to be hashed"
from hashlib import sha512
print("hashlib gives: ", sha512(message.encode()).hexdigest())
print("I give: ", sha_512(message))
As is obvious, I don't understand a lot of things in this algorithm and have literally copied many parts from the original code (also, I know it isn't good practice to write everything in a single function but I find it easier when trying to understand something).
But the biggest problem I have right now is it doesn't work! Whatever input message I provide to my function, it gives the same output:
0x6a09e667f3bcc9080xbb67ae8584caa73b0x3c6ef372fe94f82b0xa54ff53a5f1d36f1
0x510e527fade682d10x9b05688c2b3e6c1f0x1f83d9abfb41bd6b0x5be0cd19137e2179
I wrote a code at the bottom to compare it with python's hashlib
module.
Where am I going wrong in this and how do I fix this?
EDIT: As mentioned in the comments, I tried to feed in a longer message
string and the code seems to be working (it still gives longer output than hashlib
though):
message = "This is a string to be hashed. I'll try to make this string as long as possible by adding" \
"as much information to it as I can, in the hopes that this string would somehow become longer than" \
"128 bits and my code can run properly. Hopefully, this is already longer than 128 bits, so lets see" \
"how it works..."
hash: 0x6fcc0f346f2577800x334bd9b6c1178a970x90964a3f45f7b5bb0xc14033d12f6607e60xb598bea0a8b0ac1e0x116b0e134691ab540x73d88e77e5b862ba0x89181da7462c5574
message = "This is a string to be hashed. I'll try to make this string as long as possible by adding" \
"as much information to it as I can, in the hopes that this string would somehow become longer than"
hash: 0x166e40ab03bc98750xe81fe34168b6994f0xe56b81bd5972b5560x8789265c3a56b30b0x2c810d652ea7b1550xa23ca2704602a8240x12ffb1ec8f3dd6d10x88c29f84cbef8988
You'll always have to pad the message. Padding and adding the length are always required as last step of the SHA-2 process. Currently you weren't performing that last step (to completion).
Here are my last two comments that pointed you in the right direction:
So generally you try and take one 128 byte block from the binary message, update the hash state using the information in that block, then move to the next one until you have a partial or 0 byte block. That block you need to pad & add size indication (in bits) and process. If you've not enough space for the padding / size indication then you need yet another block consisting entirely of padding and the size indication. If you read carefully, then you always process at least one block.
and
Hmm, it is already in
sha_backend_digest
(the0x80
followed by zero bytes and the length which is input size * 8(_counter << 3)
.
But of course you do need to perform that and not skip any step.