I want to do clustering with the DBSCAN algorithm on a dataset of points with 3 coordinates each. This is the dataset:
1 5 7
12 8 9
2 4 10
6 3 21
11 13 0
6 3 21
11 13 0
3 7 1
1 9 2
1 5 7
I do the clustering with this code:
from math import sqrt, pow
# Intended initialiser for the DBSCAN state that the functions below read.
# NOTE(review): the indentation of this paste is lost and neither an enclosing
# class nor a `self` parameter is visible; the assignments below were
# presumably meant to be instance attributes -- confirm against the original.
def __init__(eps=0.1, min_points=2):
# eps is reassigned to 10 here, so the 0.1 default in the signature is ignored.
# region_query compares each distance against a name called eps.
eps = 10
# cluster and pool compare neighbour counts against a name called min_points.
min_points = 2
# Points already processed; cluster and pool test membership against it.
visited = []
# cluster appends points with fewer than min_points neighbours to this list.
noise = []
# expand_cluster appends ("Cluster: N", [members]) tuples to this list.
clusters = []
# The data set that region_query iterates over.
dp = []
# Entry point: walk over data_points, and for each point not yet visited either
# record it as noise or start a new cluster from its neighbours.
# NOTE(review): indentation is lost in this paste; the nesting described in
# these comments is inferred from the statement order -- confirm.
def cluster(data_points):
# NOTE(review): no `global` statement or `self` is visible, so these two
# assignments would bind names local to cluster; region_query and pool would
# then not see them -- confirm.
visited = []
dp = data_points
# Running cluster number, incremented each time a new cluster is started.
c = 0
for point in data_points:
# `in` on a list compares elements with ==; with numpy-array points this is
# presumably the same failure the quoted traceback shows for pool -- confirm.
if point not in visited:
visited.append(point)
print point
neighbours = region_query(point)
#print neighbours
# Fewer than min_points neighbours: the point is recorded as noise.
if len(neighbours) < min_points:
noise.append(point)
else:
# Otherwise start cluster number c and grow it from these neighbours.
c += 1
expand_cluster(c, neighbours)
#cluster(data_points)
# Register a new cluster and keep calling pool() on it until pool() returns
# nothing further to explore.
def expand_cluster(cluster_number, p_neighbours):
# A cluster is a (label string, member list) tuple; pool() appends to the
# member list through cluster[1].
cluster = ("Cluster: %d" % cluster_number, [])
clusters.append(cluster)
new_points = p_neighbours
# pool() returns the next batch of neighbours; an empty list ends the loop.
while new_points:
new_points = pool(cluster, new_points)
# Return a list of every element of dp whose Euclidean distance to p is at
# most eps. Only indices 0, 1 and 2 of each point are used, so the points are
# treated as three-dimensional.
def region_query(p):
result = []
for d in dp:
# Straight-line distance between d and p over the three coordinates.
distance = (((d[0] - p[0])**2 + (d[1] - p[1])**2 + (d[2] - p[2])**2)**0.5)
print distance
if distance <= eps:
result.append(d)
return result
#p_neighbours = region_query(p=pcsv)
# Process one batch of neighbours for `cluster`: mark unvisited points as
# visited, look up their own neighbours, and append points to the cluster's
# member list (cluster[1]). Returns the list of further neighbours to explore.
# NOTE(review): indentation is lost in this paste, so the exact nesting of the
# statements below cannot be determined from here -- confirm.
def pool(cluster, p_neighbours):
new_neighbours = []
for n in p_neighbours:
# The traceback quoted in this post points at this membership test.
if n not in visited:
visited.append(n)
n_neighbours = region_query(n)
if len(n_neighbours) >= min_points:
# NOTE(review): this assignment replaces new_neighbours instead of extending
# it, so results found for earlier points in the batch are discarded --
# possibly unintended; confirm.
new_neighbours = unexplored(p_neighbours, n_neighbours)
# n is appended to this cluster only when it is in neither the member list of
# the cluster c being inspected nor this cluster's own member list.
for c in clusters:
if n not in c[1] and n not in cluster[1]:
cluster[1].append(n)
return new_neighbours
# Return a new list holding the elements of y that are not in x, in the order
# they appear in y.
# NOTE(review): the @staticmethod decorator suggests these functions were
# methods of a class whose header is not shown -- confirm.
@staticmethod
def unexplored(x, y):
z = []
for p in y:
if p not in x:
z.append(p)
return z
In this code, the `point` and `n` variables hold elements of `data_points`, which contains the dataset. From reading the manual, I believe this code should work, but when I run the `cluster()` function I get an error:
Traceback (most recent call last):
File "<ipython-input-39-77eb6be20d82>", line 2, in <module>
if n not in visited:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I don't know why this code still raises that error, even when I replace the `n` or `point` variable with indexed data. Do you have any idea what is wrong with this code? How can I make it work?
Thank you for your help.
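The error occurs because `n not in visited` compares numpy arrays with `==`, which yields a boolean array rather than a single truth value. If you use `numpy`, you should use boolean masks instead of lists: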
import numpy


def cluster(data_points, eps=0.1, min_points=3):
    """Label every row of *data_points* with a DBSCAN cluster number.

    Args:
        data_points: 2-D array-like, one point per row.
        eps: Neighbourhood radius (Euclidean distance, inclusive).
        min_points: Minimum neighbourhood size, the point itself included,
            for a point to count as a core point.

    Returns:
        Integer array with one entry per input point: -1 for noise and
        1..k for the k clusters found.
    """
    # Work in floating point so unsigned or list input cannot wrap around or
    # fail in the distance computation.
    points = numpy.asarray(data_points, dtype=float)
    # 0 means "not visited yet"; every entry is overwritten before returning.
    cluster_numbers = numpy.zeros(len(points), dtype=int)
    current = 0
    for idx, point in enumerate(points):
        if cluster_numbers[idx] != 0:
            continue
        neighbours = region_query(points, point, eps)
        if numpy.count_nonzero(neighbours) < min_points:
            # Provisional noise: a later core point may still claim this
            # point as a border point.
            cluster_numbers[idx] = -1
            continue
        current += 1
        expand_cluster(current, points, cluster_numbers, neighbours, eps,
                       min_points)
    return cluster_numbers


def region_query(points, point, eps=0.1):
    """Return a boolean mask of the rows of *points* within *eps* of *point*.

    The distance is Euclidean and the bound is inclusive, so *point* itself is
    part of its own neighbourhood whenever it is a row of *points*.
    """
    offsets = (numpy.asarray(points, dtype=float)
               - numpy.asarray(point, dtype=float))
    return numpy.sqrt((offsets ** 2).sum(axis=1)) <= eps


def expand_cluster(cluster_number, points, cluster_numbers, new_points,
                   eps=0.1, min_points=1):
    """Grow cluster *cluster_number* from the seed mask *new_points*.

    Args:
        cluster_number: Positive label to assign.
        points: 2-D array of all points.
        cluster_numbers: Integer label array, modified in place.
        new_points: Boolean mask of the seed neighbourhood.
        eps: Neighbourhood radius used for further region queries.
        min_points: Only points with at least this many neighbours (core
            points) extend the cluster. The default of 1 lets every point
            extend it, which is what callers that omit the argument get.

    Raises:
        ValueError: If *cluster_number* is not positive; 0 and negative
            values are reserved for "unvisited" and "noise".
    """
    if cluster_number <= 0:
        raise ValueError("cluster_number must be positive")
    frontier = numpy.asarray(new_points, dtype=bool)
    while True:
        # Claim unvisited points (0) and provisional noise (-1): a noise
        # point reachable from a core point is a border point of the cluster.
        indices = numpy.flatnonzero(frontier & (cluster_numbers <= 0))
        if indices.size == 0:
            break
        frontier = numpy.zeros(len(cluster_numbers), dtype=bool)
        for idx in indices:
            cluster_numbers[idx] = cluster_number
            reachable = region_query(points, points[idx], eps)
            # Border points join the cluster but must not extend it,
            # otherwise separate clusters get chained together.
            if numpy.count_nonzero(reachable) >= min_points:
                frontier |= reachable
What you get is an array of integers, one for each input point. Positions with the value -1 are noise points; 1 .. n are the different clusters.
So you can get the points for each cluster:
# Label every point: -1 marks noise, 1..k are the cluster numbers.
cluster_numbers = cluster(data_points)
noise_points = data_points[cluster_numbers == -1]
# max() is -1 when every point is noise and raises on an empty array, so
# clamp the reported cluster count at zero in both cases.
total_clusters = max(int(cluster_numbers.max()), 0) if len(cluster_numbers) else 0
print("Total Clusters: %d" % total_clusters)
for idx in range(1, total_clusters + 1):
    # A boolean mask selects the rows that belong to cluster idx.
    cluster_points = data_points[cluster_numbers == idx]
    print("Cluster %d has %d points" % (idx, len(cluster_points)))