Search code examples
python-3.xnumpyhistogramnested-loopsbinning

How to merge histogram bins (edges and counts) by bin-count condition?


The Problem

I have a histogram of data that I would like to manipulate. More specifically, I would like to merge bins whose counts are less than a given threshold. This might be clearer with an example.

import numpy as np

np.random.seed(327)

data = np.random.normal(loc=50, scale=10, size=100).astype(int)
edges = np.arange(0, 101, 10).astype(int)
counts, edges = np.histogram(data, edges)

# print("\n .. {} DATA:\n{}\n".format(data.shape, data))
# print("\n .. {} EDGES:\n{}\n".format(edges.shape, edges))
# print("\n .. {} COUNTS:\n{}\n".format(counts.shape, counts))

The print commands above will output the following if not commented out:

 .. (100,) DATA:
[67 46 47 32 59 61 49 46 45 72 67 51 41 37 44 56 38 61 45 45 42 39 49 55
 32 35 52 40 55 34 52 51 39 55 50 62 47 43 48 39 53 54 75 38 53 44 46 39
 50 49 31 46 55 64 64 52 41 34 32 33 58 65 38 64 37 47 58 43 49 49 50 57
 71 44 41 39 47 51 47 63 55 52 43 43 49 65 48 43 44 38 64 49 62 41 40 67
 47 55 57 54]


 .. (11,) EDGES:
[  0  10  20  30  40  50  60  70  80  90 100]


 .. (10,) COUNTS:
[ 0  0  0 19 38 26 14  3  0  0]

Notice that counts suggests that data contains a single peak. Suppose I chose a bin threshold threshold=5 such that any bin containing less than 5 counts (0, ..., 4 counts; not including 5) is merged with the next bin. Here, next is taken to be in the direction towards the central peak.

Desired Output

By my desired merging algorithm, I would obtain the following output:

edges = [30, 40, 50, 60, 80]
counts = [19, 38, 26, 17]

Attempt at Solution

Below is my incorrect attempt at solving this problem:

def agglomerate_bins(edges, counts, threshold):
    condition = (counts >= threshold)
    indices = {}
    indices['all'] = condition
    indices['above'] = np.where(condition == True)[0]
    indices['below'] = np.where(condition != True)[0]
    # merge left-side bins rightward
    left_edges = [edges[0]]
    left_counts = []
    ileft, istop = indices['below'][0], indices['above'][0]
    while ileft < istop:
        cc = counts[ileft]
        while cc < threshold:
            ileft += 1
            cc += counts[ileft]
        ee = edges[ileft]
        left_edges.append(ee)
        left_counts.append(cc)
        ileft += 1
    # merge right-side bins leftward
    right_edges, right_counts = [], []
    iright, istop = indices['below'][-1], indices['above'][-1]
    while iright > istop:
        cc = counts[iright]
        while cc < threshold:
            iright -= 1
            cc += counts[iright]
        ee = edges[iright]
        right_edges.append(ee)
        right_counts.append(cc)
        iright -= 1
    # group modified bins with bins above threshold
    middle_edges = edges[indices['above']].tolist()
    middle_counts = edges[indices['above']].tolist()
    mod_edges = np.array(left_edges + middle_edges + right_edges[::-1])
    mod_counts = np.array(left_counts + middle_counts + right_counts[::-1])
    return mod_edges, mod_counts

mod_edges, mod_counts = agglomerate_bins(edges, counts, threshold=5)
# print("\n .. {} MODIFIED EDGES:\n{}\n".format(mod_edges.shape, mod_edges))
# print("\n .. {} MODIFIED COUNTS:\n{}\n".format(mod_counts.shape, mod_counts))

The print commands above will output the following if not commented out:

 .. (7,) MODIFIED EDGES:
[ 0 30 30 40 50 60 60]


 .. (6,) MODIFIED COUNTS:
[19 30 40 50 60 17]

Solution

  • I think a solution involves iterating through the counts and edges consolidating counts and removing 'unused' edges. This catches [ ..., 1,2,3,...] => [..., 6, ...]. counts and edges are converted to lists which allows unwanted items to be easily popped, this isn't efficient with np.arrays.

    import numpy as np
    
    np.random.seed(327)
    
    data = np.random.normal(loc=50, scale=10, size=100).astype(int)
    edges = np.arange(0, 101, 10).astype(int)
    counts, edges = np.histogram(data, edges)
    
    def combine_edges( counts, edges, threshold ):
        max_ix = counts.argmax()
        c_list = list( counts )   # Lists can be popped from
        e_list = list( edges )    # Lists can be popped from
    
        def eliminate_left( ix ):
            # Sum the count and eliminate the edge relevant to ix
            # Before the peak (max_ix)
            nonlocal max_ix
            max_ix -= 1         # max_ix will change too.
            c_list[ix+1]+=c_list[ix]
            c_list.pop(ix)
            e_list.pop(ix+1)
    
        def eliminate_right( ix ):
            # Sum the count and eliminate the edge relevant to ix
            # after the peak (max_ix) 
            c_list[ix-1]+=c_list[ix]
            c_list.pop(ix)
            e_list.pop(ix)
    
        def first_lt():
            # Find the first ix less than the threshold
            for ix, ct in enumerate( c_list[:max_ix] ):
                if ct < threshold:
                    return ix  # if ct < threshold return the index and exit the function
            # The function only reaches here if no ct values are less than the threshold
            return -1  # If zero items < threshold return -1
    
        def last_lt():
            # Find the last ix less than the threshold
            for ix, ct in zip( range(len(c_list)-1, max_ix, -1), c_list[::-1]):
                # ix reduces from len(c_list)-1, c_list is accessed in reverse order.
                if ct < threshold:
                    return ix
            return -1  # If no items < threshold return -1
    
        cont = True
        while cont:
            # Each iteration removes any counts less than threshold
            # before the peak.  This process would combine e.g. counts of [...,1,2,3,...] into [..., 6, ...]
            ix = first_lt()
            if ix < 0:
                cont = False   # If first_lt returns -1 stop while loop
            else:
                eliminate_left( ix )
    
        cont = True
        while cont:
            ix = last_lt()
            if ix < 0:
                cont = False   # If last_lt returns -1 stop while loop
            else:
                eliminate_right( ix )
    
        return np.array( c_list ), np.array( e_list )
    
    c, e = combine_edges( counts, edges, 5)
    
    print( c, '\n', e )
    # [19 38 26 17] 
    # [  0  40  50  60 100]
    
    cts, edgs = np.histogram(data, e)
    
    print( cts, '\n', edgs )
    # [19 38 26 17] 
    # [  0  40  50  60 100]
    

    This feels clumsy so there may be a better way but it does work. Does it handle consecutive items less than the threshold as required?

    Edit To answer the comment on how the first_lt works. The comments in the code above have been updated.

    Alternative implementation with only one return.

    def first_lt():
        result = -1  # Set default
        for ix, ct in enumerate( c_list[:max_ix] ):
            if ct < threshold:
                result = ix  # If ct < threshold set result to ix
                break        # Break out of the loop
        return result
    

    first_lt with print statements to show what is happening as it's executed.

    def first_lt():
        print('first_lt:',end='  ')
        for ix, ct in enumerate( c_list[:max_ix] ):
            print(ix,ct, end=': ')
            if ct < threshold:
                print('Return ix.')
                return ix
        print('Exiting loop, return -1')
        return -1