Search code examples
python cluster-analysis ambiguous dbscan

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all() python dbscan 3 dimensions point


I want to cluster a dataset of 3-dimensional points using the DBSCAN algorithm. This is the dataset:

1   5   7
12  8   9
2   4   10
6   3   21
11  13  0
6   3   21
11  13  0
3   7   1
1   9   2
1   5   7

I do the clustering with this code:

from math import sqrt, pow

def __init__(eps=0.1, min_points=2):
    """Assign the DBSCAN parameters and bookkeeping lists.

    NOTE(review): this looks like a class initializer pasted without its
    class and without `self` -- confirm against the original source. As
    written, every name assigned below is local to this function, so none
    of these values are visible to the other functions in this snippet.
    """
    eps = 10  # replaces whatever value was passed in for eps
    min_points = 2  # replaces whatever value was passed in for min_points
    visited = []
    noise = []
    clusters = []
    dp = []

def cluster(data_points):
    """Visit every point once; mark it as noise or start a new cluster from it.

    data_points: iterable of points. Each unvisited point is passed to
    region_query(); with fewer than `min_points` neighbours it is appended
    to `noise`, otherwise a new cluster number is handed to expand_cluster().

    `min_points` and `noise` are not defined inside this function, so they
    must exist at module level for this to run.
    """
    visited = []  # local to this call; pool() does not receive this list
    dp = data_points  # local to this call; region_query() looks up `dp` itself
    c = 0  # number of clusters started so far

    for point in data_points:
        # NOTE(review): `in` compares elements with ==; if the points are
        # numpy arrays, that comparison yields an array rather than a bool --
        # confirm the element type of data_points.
        if point not in visited:
            visited.append(point)
            print point
            neighbours = region_query(point)
            #print neighbours
            if len(neighbours) < min_points:
                noise.append(point)

            else:
                c += 1
                expand_cluster(c, neighbours)

#cluster(data_points)

def expand_cluster(cluster_number, p_neighbours):
    """Register a new cluster and keep pooling points until pool() returns none.

    cluster_number: number used to build the cluster's label string.
    p_neighbours: the first batch of points handed to pool().

    `clusters` is not defined inside this function; it is looked up outside.
    """
    # A cluster is a (label, members) tuple; pool() appends to the members list.
    cluster = ("Cluster: %d" % cluster_number, [])
    clusters.append(cluster)
    new_points = p_neighbours
    # pool() returns the next batch to process; an empty batch ends the loop.
    while new_points:
        new_points = pool(cluster, new_points)


def region_query(p):
    """Return every point of `dp` whose Euclidean distance to p is at most `eps`.

    p: a point indexable at positions 0, 1 and 2.

    `dp` and `eps` are not parameters; they are looked up outside this
    function. Each computed distance is also printed.
    """
    result = []
    for d in dp:
        # Straight-line distance over the first three coordinates.
        distance = (((d[0] - p[0])**2 + (d[1] - p[1])**2 + (d[2] - p[2])**2)**0.5)
        print distance
        if distance <= eps:
            result.append(d)
    return result

#p_neighbours = region_query(p=pcsv)

def pool(cluster, p_neighbours):
    """Add the given points to `cluster` and return the points still to explore.

    cluster: (label, members) tuple as passed in by expand_cluster(); the
        members list is extended in place.
    p_neighbours: points to process.

    Returns the list produced by the most recent unexplored() call, or an
    empty list if no newly visited point had at least `min_points` neighbours.

    `visited`, `min_points` and `clusters` are looked up outside this function.
    """
    new_neighbours = []
    for n in p_neighbours:
        # NOTE(review): `in` compares elements with ==; if the points are
        # numpy arrays, that comparison yields an array rather than a bool --
        # confirm the element type of p_neighbours.
        if n not in visited:
            visited.append(n)
            n_neighbours = region_query(n)
            if len(n_neighbours) >= min_points:
                # Assignment, not extension: an earlier point's result is replaced.
                new_neighbours = unexplored(p_neighbours, n_neighbours)
        # Runs for every n, visited or not: for each registered cluster c, n is
        # appended when neither c nor this cluster contains it yet.
        for c in clusters:
            if n not in c[1] and n not in cluster[1]:
                cluster[1].append(n)
    return new_neighbours

@staticmethod
def unexplored(x, y):
    z = []
    for p in y:
        if p not in x:
            z.append(p)
    return z

In this code, the `point` and `n` variables are taken from `data_points`, which contains the dataset. From reading the manual, I think this code should work, but when I run the `cluster()` function I get an error:

Traceback (most recent call last):

  File "<ipython-input-39-77eb6be20d82>", line 2, in <module>
    if n not in visited:

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

I don't know why this code still raises that error, even when I replace the `n` or `point` variable with index data. Do you have any idea what's wrong with this code? How can I make it work?

Thank you for your help.


Solution

  • If you use numpy, you should use masks instead of lists:

    def cluster(data_points, eps=0.1, min_points=3):
        """Label every point with a DBSCAN cluster number.

        Args:
            data_points: 2-D array-like with one point per row.
            eps: neighbourhood radius (Euclidean distance, inclusive).
            min_points: minimum number of points (the point itself included)
                inside the eps-neighbourhood for a point to count as a core
                point.

        Returns:
            Integer numpy array with one entry per input point: -1 for noise,
            1 .. n for the clusters.
        """
        # Accept plain lists as well as arrays.
        data_points = numpy.asarray(data_points)
        cluster_numbers = numpy.zeros(len(data_points), dtype=int)
        c = 0
        for idx, point in enumerate(data_points):
            if cluster_numbers[idx] != 0:
                # Already labelled as noise or as a member of a cluster.
                continue
            # Parenthesised form prints the same on Python 2 and Python 3.
            print(point)
            neighbours = region_query(data_points, point, eps)
            if neighbours.sum() < min_points:
                # Provisional noise: expand_cluster() may still claim this
                # point as a border point of a cluster found later.
                cluster_numbers[idx] = -1
            else:
                c += 1
                expand_cluster(c, data_points, cluster_numbers, neighbours,
                               eps, min_points)
        return cluster_numbers

    def region_query(points, point, eps=0.1):
        """Return a boolean mask of the rows of points within eps of point.

        Args:
            points: 2-D array-like with one point per row.
            point: a single point with the same number of coordinates.
            eps: maximum Euclidean distance (inclusive).
        """
        # Work in floats so squaring large integer coordinates cannot overflow.
        offsets = numpy.asarray(points, dtype=float) - point
        distance = (offsets ** 2).sum(axis=1) ** 0.5
        return distance <= eps

    def expand_cluster(cluster_number, points, cluster_numbers, new_points,
                       eps=0.1, min_points=1):
        """Grow one cluster outwards from an initial neighbourhood mask.

        Args:
            cluster_number: label written into cluster_numbers for members.
            points: 2-D array-like with one point per row.
            cluster_numbers: integer numpy array of labels, modified in place
                (0 = unvisited, -1 = noise, > 0 = cluster).
            new_points: boolean mask of the points reachable so far.
            eps: neighbourhood radius passed to region_query().
            min_points: neighbourhood size a point needs before the cluster
                is expanded through it. The default of 1 expands through
                every added point, since a neighbourhood always contains the
                point itself.
        """
        points = numpy.asarray(points)
        new_points = numpy.asarray(new_points, dtype=bool)
        while True:
            # Unvisited points join the cluster, and so do points marked as
            # noise earlier: being reachable from a core point makes them
            # border points. Members of other clusters are left alone.
            indices = numpy.where(new_points & (cluster_numbers <= 0))[0]
            if not len(indices):
                break
            new_points = numpy.zeros(len(points), dtype=bool)
            for idx in indices:
                cluster_numbers[idx] = cluster_number
                reachable = region_query(points, points[idx], eps)
                # Only core points extend the cluster; a border point belongs
                # to it but must not pull in its own neighbours.
                if reachable.sum() >= min_points:
                    new_points |= reachable
    

    What you get is an array of integers, one for each input point. Positions with the value -1 are noise points; 1 .. n are the different clusters.

    So you can get the points for a cluster:

    # Label the points, then report how many points each cluster holds.
    cluster_numbers = cluster(data_points)
    noise_points = data_points[cluster_numbers == -1]
    # max() is -1 when every point is noise and raises on an empty array, so
    # report zero clusters in both of those cases.
    total_clusters = max(int(cluster_numbers.max()), 0) if len(cluster_numbers) else 0
    # %-formatting inside print(...) produces the same text on Python 2 and 3.
    print("Total Clusters: %d" % total_clusters)
    for idx in range(1, total_clusters + 1):
        cluster_points = data_points[cluster_numbers == idx]
        print("Cluster %d has %d points" % (idx, len(cluster_points)))