I have a set of images, and I would like to hash their data into an ID.
Currently I am doing this:
import hashlib
import uuid
def get_image_uuid(pil_img):
    """Build a deterministic UUID from the complete pixel data of a PIL image.

    The bytes returned by ``pil_img.tobytes()`` are hashed with SHA-1 and the
    leading 16 bytes of the digest are used as the UUID payload.
    """
    pixel_bytes = pil_img.tobytes()
    digest = hashlib.sha1(pixel_bytes).digest()
    # A SHA-1 digest is 20 bytes long while uuid.UUID accepts exactly 16,
    # so only the first 16 bytes are kept.
    return uuid.UUID(bytes=digest[:16])
This reads all the pixel data in the image, which is overkill for a deterministic 16 byte UUID hash.
Is there a way to do something like this?
img_bytes = pil_img.tobytes(stride=16)
EDIT: I produced some detailed timing results using the script below. I should mention that the images I'm using are large (about 6MB each). I tested on both Windows and Linux:
from __future__ import absolute_import, division, print_function
import __builtin__
import time
import timeit
from PIL import Image
import hashlib
import numpy as np
import uuid
# My data getters
from vtool.tests import grabdata
# Sample images fetched through the project's test-data helper
# (get_testimg_path presumably returns a local file path -- confirm).
elephant = grabdata.get_testimg_path('elephant.jpg')
lena = grabdata.get_testimg_path('lena.jpg')
zebra = grabdata.get_testimg_path('zebra.jpg')
jeff = grabdata.get_testimg_path('jeff.png')
# Image passed to every benchmark function below; rebind to time another one.
gpath = elephant
try:
getattr(__builtin__, 'profile')
__LINE_PROFILE__ = True
except AttributeError:
__LINE_PROFILE__ = False
def profile(func):
return func
@profile
def get_image_uuid(img_bytes_):
    """Map a byte string to a deterministic UUID via a truncated SHA-1 digest.

    ``img_bytes_`` is hashed with SHA-1 and the leading 16 bytes of the
    resulting digest are used as the UUID payload.
    """
    hasher = hashlib.sha1(img_bytes_)
    # uuid.UUID needs exactly 16 bytes; SHA-1 yields 20, so the tail is dropped.
    truncated = hasher.digest()[:16]
    return uuid.UUID(bytes=truncated)
@profile
def make_uuid_PIL_bytes(gpath):
    """Benchmark variant: hash every pixel byte as serialised by PIL itself.

    Opens the image at ``gpath``, dumps its pixel data with ``tobytes()`` and
    returns the UUID computed by ``get_image_uuid``.
    """
    image = Image.open(gpath, 'r')
    # One statement per step so a line profiler can attribute time to each.
    raw_pixels = image.tobytes()
    image_uuid = get_image_uuid(raw_pixels)
    return image_uuid
@profile
def make_uuid_NUMPY_bytes(gpath):
    """Benchmark variant: hash every pixel byte, serialised through NumPy.

    Opens the image at ``gpath``, views it as a NumPy array, flattens it and
    hashes the resulting bytes with ``get_image_uuid``.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    np_flat = np_img.ravel()
    # tobytes() returns the same data as the old tostring(), which was
    # deprecated in NumPy 1.19 and removed in NumPy 2.0.
    img_bytes_ = np_flat.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
    """Benchmark variant: hash only every 16th element of the flattened image.

    Opens the image at ``gpath``, views it as a NumPy array, keeps every 16th
    element of the flattened data and hashes those bytes with
    ``get_image_uuid``.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    # Subsample: every 16th element of the flattened pixel array.
    np_flat = np_img.ravel()[::16]
    # tobytes() returns the same data as the old tostring(), which was
    # deprecated in NumPy 1.19 and removed in NumPy 2.0.
    img_bytes_ = np_flat.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
    """Benchmark variant: hash only every 64th element of the flattened image.

    Same pipeline as the stride-16 variant, but with a stride of 64 and with
    the NumPy steps chained into a single expression.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data, subsample every 64th element and serialise it.
    # tobytes() replaces tostring(), which was removed in NumPy 2.0.
    img_bytes_ = np.asarray(pil_img).ravel()[::64].tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_CONTIG_NUMPY_bytes(gpath):
    """Benchmark variant: hash every pixel byte via a contiguous NumPy array.

    Opens the image at ``gpath``, views it as a NumPy array, flattens it,
    passes the flattened array through ``np.ascontiguousarray`` and hashes the
    serialised bytes with ``get_image_uuid``.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    np_flat = np_img.ravel()
    # Apply ascontiguousarray to the pixel array itself, as the strided CONTIG
    # variants do. The earlier version serialised to a byte string first and
    # then wrapped that string in an array, costing two extra full copies.
    np_contig = np.ascontiguousarray(np_flat)
    # tobytes() replaces tostring(), which was removed in NumPy 2.0.
    img_bytes_ = np_contig.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
    """Benchmark variant: hash every 16th element via a contiguous copy.

    Opens the image at ``gpath``, views it as a NumPy array, keeps every 16th
    element of the flattened data, passes that through
    ``np.ascontiguousarray`` and hashes the serialised bytes with
    ``get_image_uuid``.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data
    np_img = np.asarray(pil_img)
    # Subsample every 16th element, then make the result contiguous.
    np_contig = np.ascontiguousarray(np_img.ravel()[::16])
    # tobytes() returns the same data as the old tostring(), which was
    # deprecated in NumPy 1.19 and removed in NumPy 2.0.
    img_bytes_ = np_contig.tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
@profile
def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
    """Benchmark variant: hash every 64th element via a contiguous copy.

    Same pipeline as the contiguous stride-16 variant, but with a stride of 64
    and with the NumPy steps chained into a single expression.
    """
    pil_img = Image.open(gpath, 'r')
    # Read PIL image data, subsample every 64th element, make it contiguous
    # and serialise it. tobytes() replaces tostring(), removed in NumPy 2.0.
    img_bytes_ = np.ascontiguousarray(np.asarray(pil_img).ravel()[::64]).tobytes()
    uuid_ = get_image_uuid(img_bytes_)
    return uuid_
if __name__ == '__main__':
    # Every benchmark variant, in the order in which it is timed.
    test_funcs = [
        make_uuid_PIL_bytes,
        make_uuid_NUMPY_bytes,
        make_uuid_NUMPY_STRIDE_16_bytes,
        make_uuid_NUMPY_STRIDE_64_bytes,
        make_uuid_CONTIG_NUMPY_bytes,
        make_uuid_CONTIG_NUMPY_STRIDE_16_bytes,
        make_uuid_CONTIG_NUMPY_STRIDE_64_bytes,
    ]
    number = 2
    func_names = [func.func_name for func in test_funcs]
    # The timeit setup string pulls the image path and every benchmark
    # function out of __main__ so the timed statement can reference them.
    setup = 'from __main__ import (gpath, %s) ' % (', '.join(func_names),)
    for func, func_name in zip(test_funcs, func_names):
        print('Running: %s' % func_name)
        if not __LINE_PROFILE__:
            total_time = timeit.timeit(
                stmt='%s(gpath)' % func_name, setup=setup, number=number)
        else:
            # A builtin `profile` was detected: call the function directly
            # and measure wall-clock time by hand instead of using timeit.
            start = time.time()
            for _ in xrange(number):
                func(gpath)
            total_time = time.time() - start
        print('timed: %r seconds in %s' % (total_time, func_name))
Here are the windows line profile results:
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes at line 91
Total time: 1.03287 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
91 @profile
92 def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
93 2 3571 1785.5 0.1 pil_img = Image.open(gpath, 'r')
94 # Read PIL image data
95 2 3310103 1655051.5 96.2 np_img = np.asarray(pil_img)
96 2 44833 22416.5 1.3 np_contig = np.ascontiguousarray(np_img.ravel()
[::16])
97 2 9657 4828.5 0.3 img_bytes_ = np_contig.tostring()
98 2 72560 36280.0 2.1 uuid_ = get_image_uuid(img_bytes_)
99 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes at line 102
Total time: 1.0385 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
102 @profile
103 def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
104 2 3285 1642.5 0.1 pil_img = Image.open(gpath, 'r')
105 # Read PIL image data
106 2 3436641 1718320.5 99.3 img_bytes_ = np.ascontiguousarray(np.asarray(p
il_img).ravel()[::64]).tostring()
107 2 19570 9785.0 0.6 uuid_ = get_image_uuid(img_bytes_)
108 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_64_bytes at line 70
Total time: 1.04175 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
70 @profile
71 def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
72 2 3356 1678.0 0.1 pil_img = Image.open(gpath, 'r')
73 # Read PIL image data
74 2 3447197 1723598.5 99.3 img_bytes_ = np.asarray(pil_img).ravel()[::64]
.tostring()
75 2 19774 9887.0 0.6 uuid_ = get_image_uuid(img_bytes_)
76 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_16_bytes at line 59
Total time: 1.0913 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
59 @profile
60 def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
61 2 3706 1853.0 0.1 pil_img = Image.open(gpath, 'r')
62 # Read PIL image data
63 2 3339663 1669831.5 91.9 np_img = np.asarray(pil_img)
64 2 112 56.0 0.0 np_flat = np_img.ravel()[::16]
65 2 217844 108922.0 6.0 img_bytes_ = np_flat.tostring()
66 2 74044 37022.0 2.0 uuid_ = get_image_uuid(img_bytes_)
67 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: get_image_uuid at line 28
Total time: 1.10141 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
28 @profile
29 def get_image_uuid(img_bytes_):
30 # hash the bytes using sha1
31 14 3665965 261854.6 99.9 bytes_sha1 = hashlib.sha1(img_bytes_)
32 14 326 23.3 0.0 hashbytes_20 = bytes_sha1.digest()
33 # sha1 produces 20 bytes, but UUID requires 16
bytes
34 14 75 5.4 0.0 hashbytes_16 = hashbytes_20[0:16]
35 14 2661 190.1 0.1 uuid_ = uuid.UUID(bytes=hashbytes_16)
36 14 40 2.9 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_PIL_bytes at line 39
Total time: 1.33926 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
39 @profile
40 def make_uuid_PIL_bytes(gpath):
41 2 25940 12970.0 0.6 pil_img = Image.open(gpath, 'r')
42 # Read PIL image data
43 2 3277455 1638727.5 73.5 img_bytes_ = pil_img.tobytes()
44 2 1158009 579004.5 26.0 uuid_ = get_image_uuid(img_bytes_)
45 2 4 2.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_bytes at line 48
Total time: 1.39694 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
48 @profile
49 def make_uuid_NUMPY_bytes(gpath):
50 2 3406 1703.0 0.1 pil_img = Image.open(gpath, 'r')
51 # Read PIL image data
52 2 3344608 1672304.0 71.9 np_img = np.asarray(pil_img)
53 2 46 23.0 0.0 np_flat = np_img.ravel()
54 2 133593 66796.5 2.9 img_bytes_ = np_flat.tostring()
55 2 1171888 585944.0 25.2 uuid_ = get_image_uuid(img_bytes_)
56 2 5 2.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_bytes at line 79
Total time: 1.4899 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
79 @profile
80 def make_uuid_CONTIG_NUMPY_bytes(gpath):
81 2 3384 1692.0 0.1 pil_img = Image.open(gpath, 'r')
82 # Read PIL image data
83 2 3376051 1688025.5 68.0 np_img = np.asarray(pil_img)
84 2 133156 66578.0 2.7 np_flat = np_img.ravel().tostring()
85 2 146959 73479.5 3.0 np_contig = np.ascontiguousarray(np_flat)
86 2 149330 74665.0 3.0 img_bytes_ = np_contig.tostring()
87 2 1154328 577164.0 23.3 uuid_ = get_image_uuid(img_bytes_)
88 2 4 2.0 0.0 return uuid_
Here are the Linux line profile results:
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_64_bytes at line 70
Total time: 0.456272 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
70 @profile
71 def make_uuid_NUMPY_STRIDE_64_bytes(gpath):
72 2 449 224.5 0.1 pil_img = Image.open(gpath, 'r')
73 # Read PIL image data
74 2 452880 226440.0 99.3 img_bytes_ = np.asarray(pil_img).ravel()[::64].
tostring()
75 2 2942 1471.0 0.6 uuid_ = get_image_uuid(img_bytes_)
76 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes at line 102
Total time: 0.457588 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
102 @profile
103 def make_uuid_CONTIG_NUMPY_STRIDE_64_bytes(gpath):
104 2 445 222.5 0.1 pil_img = Image.open(gpath, 'r')
105 # Read PIL image data
106 2 454269 227134.5 99.3 img_bytes_ = np.ascontiguousarray(np.asarray(pi
l_img).ravel()[::64]).tostring()
107 2 2872 1436.0 0.6 uuid_ = get_image_uuid(img_bytes_)
108 2 2 1.0 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes at line 91
Total time: 0.461928 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
91 @profile
92 def make_uuid_CONTIG_NUMPY_STRIDE_16_bytes(gpath):
93 2 482 241.0 0.1 pil_img = Image.open(gpath, 'r')
94 # Read PIL image data
95 2 436622 218311.0 94.5 np_img = np.asarray(pil_img)
96 2 10990 5495.0 2.4 np_contig = np.ascontiguousarray(np_img.ravel()
[::16])
97 2 2931 1465.5 0.6 img_bytes_ = np_contig.tostring()
98 2 10902 5451.0 2.4 uuid_ = get_image_uuid(img_bytes_)
99 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_STRIDE_16_bytes at line 59
Total time: 0.492819 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
59 @profile
60 def make_uuid_NUMPY_STRIDE_16_bytes(gpath):
61 2 481 240.5 0.1 pil_img = Image.open(gpath, 'r')
62 # Read PIL image data
63 2 441343 220671.5 89.6 np_img = np.asarray(pil_img)
64 2 34 17.0 0.0 np_flat = np_img.ravel()[::16]
65 2 39996 19998.0 8.1 img_bytes_ = np_flat.tostring()
66 2 10964 5482.0 2.2 uuid_ = get_image_uuid(img_bytes_)
67 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: get_image_uuid at line 28
Total time: 0.545926 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
28 @profile
29 def get_image_uuid(img_bytes_):
30 # hash the bytes using sha1
31 14 545037 38931.2 99.8 bytes_sha1 = hashlib.sha1(img_bytes_)
32 14 115 8.2 0.0 hashbytes_20 = bytes_sha1.digest()
33 # sha1 produces 20 bytes, but UUID requires 16
bytes
34 14 24 1.7 0.0 hashbytes_16 = hashbytes_20[0:16]
35 14 742 53.0 0.1 uuid_ = uuid.UUID(bytes=hashbytes_16)
36 14 8 0.6 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_PIL_bytes at line 39
Total time: 0.625736 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
39 @profile
40 def make_uuid_PIL_bytes(gpath):
41 2 3915 1957.5 0.6 pil_img = Image.open(gpath, 'r')
42 # Read PIL image data
43 2 449092 224546.0 71.8 img_bytes_ = pil_img.tobytes()
44 2 172728 86364.0 27.6 uuid_ = get_image_uuid(img_bytes_)
45 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_NUMPY_bytes at line 48
Total time: 0.663057 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
48 @profile
49 def make_uuid_NUMPY_bytes(gpath):
50 2 468 234.0 0.1 pil_img = Image.open(gpath, 'r')
51 # Read PIL image data
52 2 437346 218673.0 66.0 np_img = np.asarray(pil_img)
53 2 18 9.0 0.0 np_flat = np_img.ravel()
54 2 51512 25756.0 7.8 img_bytes_ = np_flat.tostring()
55 2 173712 86856.0 26.2 uuid_ = get_image_uuid(img_bytes_)
56 2 1 0.5 0.0 return uuid_
File: _timeits/time_uuids.py
Function: make_uuid_CONTIG_NUMPY_bytes at line 79
Total time: 0.756671 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
79 @profile
80 def make_uuid_CONTIG_NUMPY_bytes(gpath):
81 2 483 241.5 0.1 pil_img = Image.open(gpath, 'r')
82 # Read PIL image data
83 2 437192 218596.0 57.8 np_img = np.asarray(pil_img)
84 2 48152 24076.0 6.4 np_flat = np_img.ravel().tostring()
85 2 49502 24751.0 6.5 np_contig = np.ascontiguousarray(np_flat)
86 2 49269 24634.5 6.5 img_bytes_ = np_contig.tostring()
87 2 172072 86036.0 22.7 uuid_ = get_image_uuid(img_bytes_)
88 2 1 0.5 0.0 return uuid_
Here are the Windows timeit results:
Running: make_uuid_PIL_bytes
timed: 1.4041314945785952 seconds in make_uuid_PIL_bytes
Running: make_uuid_NUMPY_bytes
timed: 1.4475939890251077 seconds in make_uuid_NUMPY_bytes
Running: make_uuid_NUMPY_STRIDE_16_bytes
timed: 1.136886564762671 seconds in make_uuid_NUMPY_STRIDE_16_bytes
Running: make_uuid_NUMPY_STRIDE_64_bytes
timed: 1.0767879228155284 seconds in make_uuid_NUMPY_STRIDE_64_bytes
Running: make_uuid_CONTIG_NUMPY_bytes
timed: 1.5433727380795146 seconds in make_uuid_CONTIG_NUMPY_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
timed: 1.0804961515831941 seconds in make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
timed: 1.0577325560451953 seconds in make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
And the linux timeit results:
Running: make_uuid_PIL_bytes
timed: 0.6316661834716797 seconds in make_uuid_PIL_bytes
Running: make_uuid_NUMPY_bytes
timed: 0.666496992111206 seconds in make_uuid_NUMPY_bytes
Running: make_uuid_NUMPY_STRIDE_16_bytes
timed: 0.4908161163330078 seconds in make_uuid_NUMPY_STRIDE_16_bytes
Running: make_uuid_NUMPY_STRIDE_64_bytes
timed: 0.4494049549102783 seconds in make_uuid_NUMPY_STRIDE_64_bytes
Running: make_uuid_CONTIG_NUMPY_bytes
timed: 0.7838680744171143 seconds in make_uuid_CONTIG_NUMPY_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
timed: 0.462860107421875 seconds in make_uuid_CONTIG_NUMPY_STRIDE_16_bytes
Running: make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
timed: 0.45322108268737793 seconds in make_uuid_CONTIG_NUMPY_STRIDE_64_bytes
So it does look like loading the image is the main culprit (because these images are so big), but the strides do help the hashing by a small (but significant) amount.
Still it would be very nice to be able to load only a subset of that data. Does anyone know any way to do this?
(I'm using Pillow 5.1.0 on Python 3.6.4, on macOS 10.13.3)
I recently had a similar issue while working with images larger than 250MB(!). My use case was slightly different, as I needed actual RGB values, and not bytes, but I found that cropping the image first, and then running getdata() on the cropped area, was much faster for "random access" to a slice of the image. Specifically, on a 30MB image, it's about 28,000 times faster to do img.crop((left, upper, right, lower)).getdata() than list(img.getdata())[<slice>]. Note that crop() takes a (left, upper, right, lower) box, not (x, y, width, height):
>>> t0 = time.time(); x = list(img.getdata())[3336*500:3336*500+3]; t1 = time.time(); print(x, t1-t0)
[(92, 102, 136), (110, 153, 220), (114, 184, 232)] 1.6889581680297852
>>> t0 = time.time(); y = list(img.crop((0, 500, 3, 501)).getdata()); t1 = time.time(); print(y, t1-t0)
[(92, 102, 136), (110, 153, 220), (114, 184, 232)] 5.91278076171875e-05
(1.6 seconds vs. 0.000059 seconds)
Again, this gets you RGB values, not the image byte data, but depending on your needs, this might be acceptable. This also has the side benefit of not requiring numpy, which for me is a plus.
And of course, the logic then depends on how much data you need, and from where, as that might require wrapping around to the next row. That would be ugly, and may not be worth the maintenance/readability cost.