Search code examples
python numpy resumable

Resuming count with numpy.unique from previously computed count


I was wondering if there were already existing solutions for "resumable" computations with numpy.

Let me explain: I have a folder with a large number of grayscale images over which I need to compute a sort of histogram using the numpy.unique function. My code looks like this:

from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.image as img
import matplotlib.pyplot as plt

# collect the path of every regular file found in the image folder:
work_dir = 'path/to/my/images'
images = [
    work_dir + '/' + name
    for name in listdir(work_dir)
    if isfile(join(work_dir, name))
]

# pre-allocate a single array big enough for every image; the first image
# supplies the in-plane shape and the pixel dtype, one slice per image:
nz = len(images)
nx, ny = img.imread(images[0]).shape
pixel_dtype = img.imread(images[0]).dtype
volume = np.zeros((nx, ny, nz), pixel_dtype)
print(volume.shape, nx * ny * nz, volume.dtype)

# copy each image into its own slice along the last axis:
for i, path in enumerate(images):
    volume[:, :, i] = img.imread(path)

# histogram = number of occurrences of each distinct value in the whole volume:
values, counts = np.unique(volume, return_counts=True)
plt.plot(values, counts)

The problem is that my computer doesn't have enough RAM to allocate the necessary memory for the volume, values and counts arrays all at once.

So is there an already existing solution that would look like this:

from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.image as img
import matplotlib.pyplot as plt

# storing all the images' names that need to be processed into a list:
work_dir = 'path/to/my/images'
images = [(work_dir + '/' + f) for f in listdir(work_dir) if isfile(join(work_dir, f))]

# computing the histogram as the number of occurrences of each unique value in the first image:
values, counts = np.unique(img.imread(images[0]), return_counts=True)

# updating values and counts to include data from the other images.
# images[0] has already been counted above, so it is skipped here; starting
# the loop at index 0 would count the first image twice.
# NOTE: update_unique is the hypothetical helper this question asks about.
for path in images[1:]:
    values, counts = update_unique(img.imread(path), values, counts, return_counts=True)

plt.plot(values, counts)

I would rather avoid having to implement something myself because of time constraints. I am also open to alternatives that do not use numpy or even python.


Solution

  • I've had a little free time, so I tried to figure out how to do this on my own. I'm posting it here in case someone is interested in doing something similar. I believe my solution should be general enough that it can be used for other computations that need to aggregate the results from several separate computations while resolving duplicates.

    from os import listdir
    from os.path import isfile, join
    import numpy as np
    import matplotlib.image as img
    import matplotlib.pyplot as plt
    
    def update_unique(a, old_values, old_counts):
        """Merge the unique-value counts of ``a`` into an existing tally.

        Parameters
        ----------
        a : array_like
            New data to count (``np.unique`` flattens it).
        old_values : array_like
            Unique values tallied so far.
        old_counts : array_like
            Number of occurrences of each entry of ``old_values``.

        Returns
        -------
        values : ndarray
            Sorted unique values present in ``old_values`` or in ``a``.
        counts : ndarray
            Total number of occurrences of each entry of ``values``.
        """
        # first, compute the values and counts of a
        new_values, new_counts = np.unique(a, return_counts=True)

        # Concatenating promotes both sides to a common dtype, so values from
        # `a` that do not fit in old_values' dtype are no longer truncated.
        all_values = np.concatenate((np.asarray(old_values), new_values))
        all_counts = np.concatenate((np.asarray(old_counts), new_counts))

        # A value present on both sides appears twice in all_values; np.unique
        # collapses it and `slots` tells where each entry lands in `values`.
        values, slots = np.unique(all_values, return_inverse=True)

        # np.add.at accumulates over repeated indices, which a plain
        # `counts[slots] += all_counts` would not do.
        counts = np.zeros(values.shape, dtype=all_counts.dtype)
        np.add.at(counts, slots, all_counts)
        return values, counts
    
    def unique_over_folder(files_list):
        """Return the unique pixel values and their total counts over all files.

        The first file seeds the tally; every following file is read one at a
        time and folded into it with ``update_unique``, so only a single image
        is held in memory at any moment.
        """
        first_path, remaining_paths = files_list[0], files_list[1:]

        # seed the tally with the histogram of the first image
        values, counts = np.unique(img.imread(first_path), return_counts=True)

        # fold each remaining image into the running tally
        for path in remaining_paths:
            values, counts = update_unique(img.imread(path), values, counts)

        return values, counts
    
    # storing all the images' names that need to be processed into a list;
    # listdir returns entries in arbitrary order, so the list is sorted to make
    # the image indices used below reproducible from run to run:
    work_dir = 'path/to/your/data'
    images = sorted(join(work_dir, f) for f in listdir(work_dir) if isfile(join(work_dir, f)))

    # second image for the pairwise example: index 1000 when the folder holds
    # enough files, otherwise the last one available (avoids an IndexError on
    # small folders)
    other = min(1000, len(images) - 1)

    # each of the two images is read from disk only once
    image_first = img.imread(images[0])
    image_other = img.imread(images[other])

    values0, counts0 = np.unique(image_first, return_counts=True)
    values1000, counts1000 = np.unique(image_other, return_counts=True)
    values_fus, counts_fus = update_unique(image_other, values0, counts0)

    # Example combining two values/counts pairs:
    fig, axs = plt.subplots(2, 1, sharex=True)

    axs[0].plot(values0, counts0, '.', label='image 0')
    axs[0].plot(values1000, counts1000, '.', label=f'image {other}')
    axs[0].plot(values_fus, counts_fus, '.', label='combined')

    axs[0].legend()
    axs[0].set_title('Example combining two values/counts pairs')

    # Example with full list of images:
    values, counts = unique_over_folder(images)

    axs[1].plot(values, counts, '.')
    axs[1].set_title('Example with full list of images')

    fig.savefig('exampes.png')