python · numpy · k-means

python kmedoids - calculating new medoid centers more efficiently


I'm following an excellent Medium article: https://towardsdatascience.com/k-medoids-clustering-on-iris-data-set-1931bf781e05 to implement k-medoids from scratch. There is a place in the code where each data point's distance to the medoid centers is calculated, and it is VERY slow. It has numpy.linalg.norm inside a loop. Is there a way to optimize this, either by using numpy.linalg.norm differently, with numpy broadcasting, or with scipy.spatial.distance.cdist and np.argmin, while still computing the same thing?

###helper function here###
def compute_d_p(X, medoids, p):
    m = len(X)
    medoids_shape = medoids.shape
    # If a 1-D array is provided, 
    # it will be reshaped to a single row 2-D array
    if len(medoids_shape) == 1: 
        medoids = medoids.reshape((1,len(medoids)))
    k = len(medoids)

    S = np.empty((m, k))

    for i in range(m):
        d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
        S[i, :] = d_i**p

    return S

This is where the slowdown occurs:


        for datap in cluster_points:
            new_medoid = datap
            new_dissimilarity = np.sum(compute_d_p(X, datap, p))

            if new_dissimilarity < avg_dissimilarity :
                avg_dissimilarity = new_dissimilarity

                out_medoids[i] = datap

Full code below. All credits to the article author.

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# Dataset
iris = datasets.load_iris()
data = pd.DataFrame(iris.data,columns = iris.feature_names)

target = iris.target_names
labels = iris.target

#Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

#PCA Transformation
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(data)
PCAdf = pd.DataFrame(data = principalComponents , columns = ['principal component 1', 'principal component 2','principal component 3'])

datapoints = PCAdf.values
m, f = datapoints.shape
k = 3

def init_medoids(X, k):
    from numpy.random import choice
    from numpy.random import seed

    seed(1)
    samples = choice(len(X), size=k, replace=False)
    return X[samples, :]

medoids_initial = init_medoids(datapoints, 3)

def compute_d_p(X, medoids, p):
    m = len(X)
    medoids_shape = medoids.shape
    # If a 1-D array is provided, 
    # it will be reshaped to a single row 2-D array
    if len(medoids_shape) == 1: 
        medoids = medoids.reshape((1,len(medoids)))
    k = len(medoids)

    S = np.empty((m, k))

    for i in range(m):
        d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
        S[i, :] = d_i**p

    return S

S = compute_d_p(datapoints, medoids_initial, 2)


def assign_labels(S):
    return np.argmin(S, axis=1)

labels = assign_labels(S)

def update_medoids(X, medoids, p):

    S = compute_d_p(X, medoids, p)
    labels = assign_labels(S)

    out_medoids = medoids.copy()  # copy so the passed-in medoids aren't mutated in place

    for i in set(labels):

        avg_dissimilarity = np.sum(compute_d_p(X, medoids[i], p))

        cluster_points = X[labels == i]

        for datap in cluster_points:
            new_medoid = datap
            new_dissimilarity = np.sum(compute_d_p(X, datap, p))

            if new_dissimilarity < avg_dissimilarity :
                avg_dissimilarity = new_dissimilarity

                out_medoids[i] = datap

    return out_medoids

def has_converged(old_medoids, medoids):
    return set([tuple(x) for x in old_medoids]) == set([tuple(x) for x in medoids])

#Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
    if starting_medoids is None:
        medoids = init_medoids(X, k)
    else:
        medoids = starting_medoids

    converged = False
    labels = np.zeros(len(X))
    i = 1
    while (not converged) and (i <= max_steps):
        old_medoids = medoids.copy()

        S = compute_d_p(X, medoids, p)

        labels = assign_labels(S)

        medoids = update_medoids(X, medoids, p)

        converged = has_converged(old_medoids, medoids)
        i += 1
    return (medoids,labels)

results = kmedoids(datapoints, 3, 2)
final_medoids = results[0]
data['clusters'] = results[1]



Solution

  • There's a good chance numpy's broadcasting capabilities will help. Getting broadcasting to work in 3+ dimensions is a bit tricky, and I usually have to resort to a bit of trial and error to get the details right.

    The use of linalg.norm here compounds things further, because my version of the code won't give identical results to linalg.norm for all inputs. But I believe it will give identical results for all relevant inputs in this case.

    I've added some comments to the code to explain the thinking behind certain details.

    def compute_d_p_broadcasted(X, medoids, p):
    
        # If a 1-D array is provided, 
        # it will be reshaped to a single row 2-D array
    
        if len(medoids.shape) == 1: 
            medoids = medoids.reshape((1,len(medoids)))
    
        # In general, broadcasting n-dim arrays requires that the last
        # dim of the first array be a singleton dimension, and that the
        # first dim of the second array be a singleton dimension. We can
        # quickly accomplish that by slicing with `None` in the appropriate
        # places. (`np.newaxis` is a slightly more self-documenting way
        # of spelling `None`, but I rarely bother.)
    
        # In this case, the shapes of the other two dimensions also
        # have to align in the same way you'd expect for a dot product.
        # So we pass `medoids.T`.
    
        diff = np.abs(X[:, :, None] - medoids.T[None, :, :])
    
        # The last tricky bit is to figure out which axis to sum. Right
        # now, the array is a 3-dimensional array, with the first 
        # dimension corresponding to the rows of `X` and the last 
        # dimension corresponding to the columns of `medoids.T`. 
        # The middle dimension corresponds to the underlying dimensionality
        # of the space; that's what we want to sum for a sum of squares.
        # (Or sum of cubes for L3 norm, etc.)
    
        return (diff ** p).sum(axis=1)
    
    def compute_d_p(X, medoids, p):
        m = len(X)
        medoids_shape = medoids.shape
        # If a 1-D array is provided, 
        # it will be reshaped to a single row 2-D array
        if len(medoids_shape) == 1: 
            medoids = medoids.reshape((1,len(medoids)))
        k = len(medoids)
        S = np.empty((m, k))
        for i in range(m):
            d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
            S[i, :] = d_i**p
        return S
    
    # A couple of simple tests:
    X = np.array([[ 1.0,  2,  3],
                  [ 4,  5,  6],
                  [ 7,  8,  9],
                  [10, 11, 12]])
    
    medoids = X[[0, 2], :]
    
    np.allclose(compute_d_p(X, medoids, 2), 
                compute_d_p_broadcasted(X, medoids, 2))
    # Returns True
    np.allclose(compute_d_p(X, medoids, 3),
                compute_d_p_broadcasted(X, medoids, 3))
    # Returns True
    

    Of course, the allclose tests above only check correctness; they don't tell you whether this actually gives a significant speedup. You'll have to check that yourself for the relevant use case. But I suspect it will at least help.