Tags: python, matplotlib, cluster-analysis, k-means

How to graph KMeans?


I am working with a data set and trying to learn KMeans clustering. Here is the code I am working with:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Create Points to cluster
Points = pd.DataFrame()
Points.loc[:,0] = [243,179,152,255,166,162,233,227,204,341,283,202,217,197,191,114,
      153,215,196,187,127,85,182,172,184,252,193,191,187,193,197,200,
      186,188,155,-99,22,68,167,-75,30,49,63,45,58,52,164,51,49,68,52,43,68,
      72,-51,59,56,-127,33,68,143,-26,-85,84,11,105,62,47,-75,2,67,-41,-33,
      10,28,23,34,19,13,6,-73,155,30]
Points.loc[:,1] = [2.1,4,2.6,2.1,2.5,0.4,0.3,4.9,1.1,1,-1.5,3.3,2.2,1.9,2.4,2.2,0.9,
      1.8,1.7,3.2,2.4,4.4,1.4,4.4,2.6,0.6,2.9,3.8,2.6,8.5,8.8,7.5,8.3,8.5,
      3.5,6.3,-1.4,-0.4,3,-5.2,-2.7,-3.2,-0.8,-3.9,-0.6,0.9,-5.1,-2.2,
      -0.3,-1.2,0.1,-2.1,-2.1,3.7,11.8,0,0,-6.6,-1,10.1,11.9,-3,-22,-18.2,-13.3,
      -8.4,-21.7,-16.7,-13.8,-13.9,-13.2,-14.9,-21.6,-16.4,-14.4,-15.8,
      -15.3,-15.3,-2.7,-13.2,-8.9,-3.3,-12.9]

# Create initial cluster centroids
ClusterCentroidGuesses = pd.DataFrame()
ClusterCentroidGuesses.loc[:,0] = [100, 200, 0]
ClusterCentroidGuesses.loc[:,1] = [2, -2, 0]

def Plot2DKMeans(Points, Labels, ClusterCentroids, Title):
    for LabelNumber in range(max(Labels)+1):
        LabelFlag = Labels == LabelNumber
        color =  ['c', 'm', 'y', 'b', 'g', 'r', 'c', 'm', 'y', 
                  'b', 'g', 'r', 'c', 'm', 'y'][LabelNumber]
        marker = ['s', 'o', 'v', '^', '<', '>', '8', 'p', '*', 
                  'h', 'H', 'D', 'd', 'P', 'X'][LabelNumber]
        plt.scatter(Points.loc[LabelFlag,0], Points.loc[LabelFlag,1],
                    s= 100, c=color, edgecolors="black", alpha=0.3, marker=marker)
        plt.scatter(ClusterCentroids.loc[LabelNumber,0],
                    ClusterCentroids.loc[LabelNumber,1], 
                    s=200, c="black", marker=marker)
    plt.title(Title)
    plt.show()

def KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2):
    PointsNorm = Points.copy()
    ClusterCentroids = ClusterCentroidGuesses.copy()
    if NormD1:
        # Determine mean of 1st dimension
        mean1 = np.mean(PointsNorm[:,0])
        # Determine standard deviation of 1st dimension
        std1 = np.std(PointsNorm[:,0])
        # Normalize 1st dimension of Points
        PointsNorm[:,0] = ((PointsNorm[:,0] - mean1)/std1) 
        # Normalize 1st dimension of ClusterCentroids
        Cmean1 = np.mean(ClusterCentroids[:,0])
        Cstd1 = np.std(ClusterCentroids[:,0])
        ClusterCentroids[:,0] = ((ClusterCentroids[:,0] - Cmean1)/Cstd1)
    if NormD2:
        # Determine mean of 2nd dimension
        mean2 = np.mean(PointsNorm[:,1])
        # Determine standard deviation of 2nd dimension
        std2 = np.std(PointsNorm[:,1])
        # Normalize 2nd dimension of Points
        PointsNorm[:,1] = ((PointsNorm[:,1] - mean2)/std2) 
        # Normalize 2nd dimension of ClusterCentroids
        Cmean2 = np.mean(ClusterCentroids[:,1])
        Cstd2 = np.std(ClusterCentroids[:,1])
        ClusterCentroids[:,1] = ((ClusterCentroids[:,1] - Cmean2)/Cstd2)
    # Do actual clustering
    kmeans = KMeans(n_clusters=3, init=ClusterCentroidGuesses, n_init=1).fit(PointsNorm)
    Labels = kmeans.labels_
    ClusterCentroids = pd.DataFrame(kmeans.cluster_centers_)
    if NormD1:
        # Denormalize 1st dimension
        PointsNorm[:,0] = PointsNorm[:,0]*std1+mean1
        ClusterCentroids[:,0] = ClusterCentroids[:0]*Cstd1+Cmean1
    if NormD2:
        # Denormalize 2nd dimension
        PointsNorm[:,1] = PointsNorm[:,1]*std2+mean2
        ClusterCentroids[:,1] = ClusterCentroids[:1]*Cstd2+Cmean2
    return Labels, ClusterCentroids

# Compare distributions of the two dimensions
plt.rcParams["figure.figsize"] = [6.0, 4.0] # Standard
plt.hist(Points.loc[:,0], bins = 20, color=[0, 0, 1, 0.5])
plt.hist(Points.loc[:,1], bins = 20, color=[1, 1, 0, 0.5])
plt.title("Compare Distributions")
plt.show()

# Change the plot dimensions
plt.rcParams["figure.figsize"] = [8, 8] # Square
# plt.rcParams["figure.figsize"] = [8, 0.5] # Wide
# plt.rcParams["figure.figsize"] = [0.5, 8] # Tall

# Cluster without normalization
# Are the points separated into clusters along one or both dimensions?
# Which dimension separates the points into clusters?
# Set Normalizations
NormD1=False
NormD2=False
Labels, ClusterCentroids = KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2)
Title = 'No Normalization'
Plot2DKMeans(Points, Labels, ClusterCentroids, Title)

# Set Normalizations
NormD1=True
NormD2=False
Labels, ClusterCentroids = KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2)
Title = 'Normalization on 1st Dimension'
Plot2DKMeans(Points, Labels, ClusterCentroids, Title)

When trying to graph with NormD1=True, I receive an error that reads:

TypeError: '(slice(None, None, None), 0)' is an invalid key

Can someone help me understand where I am going wrong?


Solution

  • It seems like you are over-engineering this thing! Or maybe you are just trying to learn the mechanics of KMeans. Let's simplify it, make that work, and then you can extrapolate from the simple case to something more complex. Here is a simple example to get you started.

    # K-MEANS CLUSTERING
    # Importing Modules
    from sklearn import datasets
    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
    
    # Loading dataset
    iris_df = datasets.load_iris()
    
    # Declaring Model
    model = KMeans(n_clusters=3)
    
    # Fitting Model
    model.fit(iris_df.data)
    
    # Predicting a single input
    predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])
    
    # Prediction on the entire data
    all_predictions = model.predict(iris_df.data)
    
    # Printing Predictions
    print(predicted_label)
    print(all_predictions)
    
    
    # import some data to play with
    iris = datasets.load_iris()
    X = iris.data[:, :3]  # we only take the first three features
    y = iris.target
    
    
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1, projection='3d')  # keep plt as the module; use ax for the axes
    ax.scatter(X[:, 0], X[:, 1], X[:, 2],
               c=all_predictions, edgecolor='red', s=40, alpha=0.5)
    ax.set_title("First three iris features")
    ax.set_xlabel(iris.feature_names[0])
    ax.set_ylabel(iris.feature_names[1])
    ax.set_zlabel(iris.feature_names[2])
    plt.show()
    

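    If a 2D view is enough for a first look, the same idea can be drawn directly with matplotlib: color each point by its cluster label and overlay the fitted centroids from cluster_centers_. Here is a minimal sketch (refitting on just the first two iris features so the plot shows exactly what the model saw; the variable names are only illustrative):

    from sklearn import datasets
    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt

    iris = datasets.load_iris()
    X2 = iris.data[:, :2]  # first two features only, so the 2D plot matches the fit
    model2d = KMeans(n_clusters=3, n_init=10).fit(X2)

    # Points colored by cluster label, centroids overlaid in black
    plt.scatter(X2[:, 0], X2[:, 1], c=model2d.labels_, s=40, alpha=0.5)
    plt.scatter(model2d.cluster_centers_[:, 0], model2d.cluster_centers_[:, 1],
                c='black', s=200, marker='X')
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
    plt.title("KMeans clusters on two iris features")
    plt.show()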

    Personally, I believe 3D charts are better for rendering KMeans results. Sometimes 2D charts work well, but they can lack detail and end up misrepresenting what is really going on in the dataset. Finally, the data should be normalized (put on a comparable scale) to begin with, or you may get some really weird results!
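
    As for the TypeError in the question itself: a pandas DataFrame does not accept NumPy-style indexing like PointsNorm[:,0], and indexing a DataFrame that way is what produces "'(slice(None, None, None), 0)' is an invalid key"; positional column access goes through .iloc[:, 0] (or .loc[:, 0], since the columns are labelled 0 and 1). Below is a rough sketch of the normalize-then-cluster idea using that accessor together with scikit-learn's StandardScaler; the small DataFrame is just a stand-in for Points:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans

    # Stand-in for Points: two integer-labelled columns, as in the question
    df = pd.DataFrame({0: [243, 179, 152, -99, 22, 68],
                       1: [2.1, 4.0, 2.6, 3.5, 6.3, -1.4]})

    first_dim = df.iloc[:, 0]   # works; df[:, 0] raises the invalid-key TypeError

    # Put both dimensions on a comparable scale, then cluster
    scaled = StandardScaler().fit_transform(df)
    kmeans = KMeans(n_clusters=3, n_init=10).fit(scaled)
    print(kmeans.labels_)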