I have a dataset of points in two dimensions that I want to classify using the K-means technique.
The data:
import numpy as np
x1 = np.array([3,1,1,2,1,6,6,6,5,6,7,8,9,8,9,9,8])
x2 = np.array([5,4,5,6,5,8,6,7,6,7,1,2,1,2,3,2,3])
X = np.array(list(zip(x1,x2))).reshape(len(x1), 2)
I want to do an interation over the number of clusters from 1 to 9 to test the final distribution on a scatterplot. So I calculate the centroid of the data set.
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
max_k = 10
K = range(1,max_k)
centroid = [sum(X)/len(X) for k in K]
sst = sum(np.min(cdist(X, centroid, "euclidean"), axis = 1))
Then create a color palette with one rgb
color for each iteration with cm.Spectral
.
color_palette = [plt.cm.Spectral(float(k)/max_k) for k in K]
And use it in the loop where I iterate over k
:
from sklearn.cluster import KMeans
import pandas as pd
ssw = []
for k in K:
kmeanModel = KMeans(n_clusters=k).fit(X)
centers = pd.DataFrame(kmeanModel.cluster_centers_)
labels = kmeanModel.labels_
ssw_k = sum(np.min(cdist(X, kmeanModel.cluster_centers_), axis = 1))
ssw.append(ssw_k)
label_color = [color_palette[i] for i in labels]
plt.plot()
plt.xlim([0,10])
plt.ylim([0,10])
plt.title("Clustering for k = %s"%str(k))
plt.scatter(x1,x2, c=label_color)
plt.scatter(centers[0], centers[1], c=color_palette, marker = "x")
plt.show()
I am reproducing this code in my Python 3.7.3 version and I know, from the source of this piece of code, that it worked fine in older versions. When the function Spectral
from matplotlib.pyplot.cm
was written in lower case (spectral
).
The result is the next.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
4237 valid_shape = False
-> 4238 raise ValueError
4239 except ValueError:
ValueError:
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-26-2f513f9c616c> in <module>
24 plt.title("Clustering for k = %s"%str(k))
25 plt.scatter(x1,x2, c=label_color)
---> 26 plt.scatter(centers[0], centers[1], c=[i for i in color_palette], marker = "x")
27 plt.show()
~/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, data, **kwargs)
2860 vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,
2861 verts=verts, edgecolors=edgecolors, **({"data": data} if data
-> 2862 is not None else {}), **kwargs)
2863 sci(__ret)
2864 return __ret
~/anaconda3/lib/python3.7/site-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs)
1808 "the Matplotlib list!)" % (label_namer, func.__name__),
1809 RuntimeWarning, stacklevel=2)
-> 1810 return func(ax, *args, **kwargs)
1811
1812 inner.__doc__ = _add_data_doc(inner.__doc__,
~/anaconda3/lib/python3.7/site-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, **kwargs)
4243 "acceptable for use with 'x' with size {xs}, "
4244 "'y' with size {ys}."
-> 4245 .format(nc=n_elem, xs=x.size, ys=y.size)
4246 )
4247 # Both the mapping *and* the RGBA conversion failed: pretty
ValueError: 'c' argument has 9 elements, which is not acceptable for use with 'x' with size 1, 'y' with size 1.
I expected the center of each group to color just like the group itself.
Thanks in advance.
Try using the corresponding sized color palette via an index which corresponds to the length of the x and y values as following.
P.S: your code works fine in matplotlib 2.2.2
for i, k in enumerate(K):
# rest of your code
plt.scatter(centers[0], centers[1], c=color_palette[0:i+1], marker = "x")
print (centers[0].values)
plt.show()