I have some data that contains ratios of 5 elements, 'a', 'b', 'c', 'd' and 'e', which looks something like this:
# Sample input: one dict per record, mapping each element to its ratio.
data = [
    {'a': 0.197, 'b': 0.201, 'c': 0.199, 'd': 0.202, 'e': 0.201},
    {'a': 0.624, 'b': 0.628, 'c': 0.623, 'd': 0.625, 'e': 0.750},
    {'a': 0.192, 'b': 0.203, 'c': 0.200, 'd': 0.202, 'e': 0.203},
    {'a': 0.630, 'b': 0.620, 'c': 0.625, 'd': 0.623, 'e': 0.752},
]
I would like to hash each ratio record (represented as a dict) into a string that can be used as a unique identifier for ratios within a tolerance. For example, with a tolerance of 0.1 for the ratio of each element, the expectation is that the first and third dicts should have the same identifier, and the second and fourth dicts should have the same identifier. This is easy to do if one just wants to compare whether two ratio records are within the tolerance, but I am not sure how to create unique identifiers.
Edit: I am looking for some rounding method, instead of completely arbitrary hashing.
How about simply flooring and concatenating?
# Sample records: each dict maps an element key to its ratio.
data = [
    {'a': 0.197, 'b': 0.201, 'c': 0.199, 'd': 0.202, 'e': 0.201},
    {'a': 0.624, 'b': 0.628, 'c': 0.623, 'd': 0.625, 'e': 0.750},
    {'a': 0.192, 'b': 0.203, 'c': 0.200, 'd': 0.202, 'e': 0.203},
    {'a': 0.630, 'b': 0.620, 'c': 0.625, 'd': 0.623, 'e': 0.752},
]
def hashwithtol(datum, abstol=0.1, keys='abcde'):
    """Return a bucket identifier for *datum* at resolution *abstol*.

    Each selected value is floor-divided by ``abstol`` and the resulting
    integer bucket indices are joined with commas. Two records therefore
    share an identifier exactly when every selected value falls into the
    same bucket.

    Args:
        datum: Mapping from element key to its numeric ratio.
        abstol: Bucket width used for the floor division.
        keys: Iterable of keys to read from ``datum``, in output order.
            Defaults to the five elements ``'a'`` .. ``'e'``.

    Returns:
        A comma-separated string of bucket indices, e.g. ``'6,6,6,6,7'``.

    Raises:
        KeyError: If ``datum`` lacks one of ``keys``.
        ZeroDivisionError: If ``abstol`` is zero.

    Note:
        Bucketing is not the same as "within tolerance": values closer
        than ``abstol`` can straddle a bucket edge (0.199 -> 1 but
        0.200 -> 2 at ``abstol=0.1``). Binary floating point can also put
        a value that prints as an exact multiple of ``abstol`` into the
        bucket below (0.75 -> 74 at ``abstol=0.01``).
    """
    return ','.join(str(int(datum[k] // abstol)) for k in keys)
def groupby_hashwithtol(data, abstol=0.1):
    """Bucket the records in *data* by their ``hashwithtol`` identifier.

    Returns a plain dict mapping each identifier to the list of records
    that produced it, with records kept in the order they were seen.
    """
    buckets = {}
    for record in data:
        identifier = hashwithtol(record, abstol)
        if identifier not in buckets:
            buckets[identifier] = []
        buckets[identifier].append(record)
    return buckets
# Show how the grouping coarsens or refines as the tolerance changes.
for abstol in (1, 0.1, 0.01):
    print(f'Abs tol = {abstol}')
    groups = groupby_hashwithtol(data, abstol)
    # One "identifier: records" line per group.
    print('\n'.join(f'{key}: {members}' for key, members in groups.items()))
    print()
Abs tol = 1
0,0,0,0,0: [{'a': 0.197, 'b': 0.201, 'c': 0.199, 'd': 0.202, 'e': 0.201}, {'a': 0.624, 'b': 0.628, 'c': 0.623, 'd': 0.625, 'e': 0.75}, {'a': 0.192, 'b': 0.203, 'c': 0.2, 'd': 0.202, 'e': 0.203}, {'a': 0.63, 'b': 0.62, 'c': 0.625, 'd': 0.623, 'e': 0.752}]
Abs tol = 0.1
1,2,1,2,2: [{'a': 0.197, 'b': 0.201, 'c': 0.199, 'd': 0.202, 'e': 0.201}]
6,6,6,6,7: [{'a': 0.624, 'b': 0.628, 'c': 0.623, 'd': 0.625, 'e': 0.75}, {'a': 0.63, 'b': 0.62, 'c': 0.625, 'd': 0.623, 'e': 0.752}]
1,2,2,2,2: [{'a': 0.192, 'b': 0.203, 'c': 0.2, 'd': 0.202, 'e': 0.203}]
Abs tol = 0.01
19,20,19,20,20: [{'a': 0.197, 'b': 0.201, 'c': 0.199, 'd': 0.202, 'e': 0.201}]
62,62,62,62,74: [{'a': 0.624, 'b': 0.628, 'c': 0.623, 'd': 0.625, 'e': 0.75}]
19,20,20,20,20: [{'a': 0.192, 'b': 0.203, 'c': 0.2, 'd': 0.202, 'e': 0.203}]
62,61,62,62,75: [{'a': 0.63, 'b': 0.62, 'c': 0.625, 'd': 0.623, 'e': 0.752}]
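To round to the nearest bucket instead of flooring, replace `int(datum[k] // abstol)` with `int(round(datum[k] / abstol))`.
If you want an actual hash value rather than a readable string, wrap the joined string in `hash`:
def hashwithtol(datum, abstol=0.1):
    return hash(','.join(str(int(datum[k] // abstol)) for k in 'abcde'))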