Tags: python, numpy, matrix, point-clouds

Efficiently calculating grid-based point density in a 3D point cloud


I have a 3D point cloud stored as a matrix, and I am trying to find the largest point density within a smaller volume inside it. I am currently using a 3D grid-histogram approach: I loop through every point in the cloud and increment the value of the corresponding grid cell. Then I can simply take the max value of the grid matrix.

I have already written code that works, but it is horribly slow for what I am trying to do:

import numpy as np

def densityPointCloud(points, gridCount, gridSize):
    hist = np.zeros((gridCount, gridCount, gridCount), np.uint16)

    # snap each point to a grid cell and shift so the grid is centered on the origin
    rndPoints = np.rint(points / gridSize) + int(gridCount / 2)
    rndPoints = rndPoints.astype(int)

    # count each in-bounds point, one at a time
    for point in rndPoints:
        if np.amax(point) < gridCount and np.amin(point) >= 0:
            hist[point[0], point[1], point[2]] += 1

    return hist


cloud = (np.random.rand(100000, 3)*10)-5
histogram = densityPointCloud(cloud , 50, 0.2)
print(np.amax(histogram))

Are there any shortcuts I can take to do this more efficiently?


Solution

  • Here's a start:

    import numpy as np
    import time
    from collections import Counter
    
    # if you need the whole histogram object
    def dpc2(points, gridCount, gridSize):
    
        hist = np.zeros((gridCount, gridCount, gridCount), np.uint16)
        rndPoints = np.rint(points / gridSize) + int(gridCount / 2)
        rndPoints = rndPoints.astype(int)
        # vectorized bounds check: one boolean per point, computed in C
        inbounds = np.logical_and(np.amax(rndPoints, axis=1) < gridCount,
                                  np.amin(rndPoints, axis=1) >= 0)
    
        # only iterate over the points that actually land in the grid
        for point in rndPoints[inbounds, :]:
            hist[point[0], point[1], point[2]] += 1
    
        return hist
    
    # if you just care about the max point
    def dpc3(points, gridCount, gridSize):
    
        rndPoints = np.rint(points / gridSize) + int(gridCount / 2)
        rndPoints = rndPoints.astype(int)
        inbounds = np.logical_and(np.amax(rndPoints, axis=1) < gridCount,
                                  np.amin(rndPoints, axis=1) >= 0)
        # cheap hashing: collapse each (x, y, z) into a single integer
        phashes = (gridCount * gridCount * rndPoints[inbounds, 0]
                   + gridCount * rndPoints[inbounds, 1]
                   + rndPoints[inbounds, 2])
        max_h, max_v = Counter(phashes).most_common(1)[0]
    
        # invert the hash back into grid coordinates
        max_coord = [(max_h // (gridCount * gridCount)) % gridCount,
                     (max_h // gridCount) % gridCount,
                     max_h % gridCount]
        return (max_coord, max_v)
    
    # TESTING (densityPointCloud is the original function from the question)
    cloud = (np.random.rand(200000, 3) * 10) - 5
    t1 = time.perf_counter()
    hist1 = densityPointCloud(cloud, 50, 0.2)
    t2 = time.perf_counter()
    hist2 = dpc2(cloud, 50, 0.2)
    t3 = time.perf_counter()
    hist3 = dpc3(cloud, 50, 0.2)
    t4 = time.perf_counter()
    print(f"task 1: {round(1000*(t2-t1))}ms\ntask 2: {round(1000*(t3-t2))}ms\ntask 3: {round(1000*(t4-t3))}ms")
    print(f"max value is {hist3[1]}, achieved at {hist3[0]}")
    np.all(np.equal(hist1, hist2))  # check that results are identical
    # check for equal max - histogram may be multi-modal so the point won't
    # necessarily match
    np.unravel_index(np.argmax(hist2, axis=None), hist2.shape)
    

    The idea is to do all the if/and comparisons once: let numpy do them (effectively in C) rather than doing them 'manually' inside a Python loop. This also lets us iterate only over the points that will actually increment hist.
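
    If you want to go further and drop the remaining per-point loop too, np.add.at does an unbuffered scatter-add that handles repeated indices correctly (a plain fancy-indexed += would silently drop duplicate hits in the same cell). A minimal sketch under the same bounds logic as above; dpc4 is just an illustrative name, and I haven't benchmarked it here:

    def dpc4(points, gridCount, gridSize):
        rndPoints = (np.rint(points / gridSize) + int(gridCount / 2)).astype(int)
        inbounds = np.logical_and(np.amax(rndPoints, axis=1) < gridCount,
                                  np.amin(rndPoints, axis=1) >= 0)
        idx = rndPoints[inbounds]
        hist = np.zeros((gridCount, gridCount, gridCount), np.uint16)
        # accumulate all points in one call; np.add.at counts duplicates
        np.add.at(hist, (idx[:, 0], idx[:, 1], idx[:, 2]), 1)
        return hist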

    You can also consider using a sparse data structure for hist if you expect your cloud to have lots of empty space; memory allocation can become a bottleneck for very large grids.
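
    For example, a Counter keyed by cell coordinates works as a simple sparse histogram, since memory then scales with the number of occupied cells rather than with gridCount**3. A sketch (sparse_hist is just an illustrative name):

    def sparse_hist(points, gridCount, gridSize):
        rndPoints = (np.rint(points / gridSize) + int(gridCount / 2)).astype(int)
        inbounds = np.logical_and(np.amax(rndPoints, axis=1) < gridCount,
                                  np.amin(rndPoints, axis=1) >= 0)
        # map(tuple, ...) makes each row hashable so Counter can key on it
        return Counter(map(tuple, rndPoints[inbounds]))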

    Did not scientifically benchmark this, but it appears to run ~2-3x faster (v2) and 6-8x faster (v3)! If you'd like all the points that are tied for the max density, it would be easy to extract them from the Counter object.
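
    For example, assuming you keep the Counter from dpc3 around as counts = Counter(phashes) (counts and tied_coords are illustrative names):

    max_v = max(counts.values())
    tied = [h for h, v in counts.items() if v == max_v]
    # decode each hash back into (x, y, z) grid coordinates
    tied_coords = [((h // (gridCount * gridCount)) % gridCount,
                    (h // gridCount) % gridCount,
                    h % gridCount) for h in tied]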