Search code examples
pythonscipycluster-analysis

Cluster groups by direction and magnitude - Python


I'm hoping to cluster vectors based on their direction and magnitude using Python. I've found a few examples in R but none in Python. This is not to be confused with standard k-means on scatter points: I'm trying to cluster whole vectors, not individual points.

The following takes two sets of xy points to generate a vector. I'm then hoping to cluster these vectors based on the length and direction.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

# Random test data: each row is one vector, (A, B) being its start point
# and (C, D) its end point.
df = pd.DataFrame(np.random.randint(0, 20, size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'

fig, ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)

A, B, C, D = (df[col] for col in 'ABCD')

# Draw every input vector as an arrow from (A, B) to (C, D).
ax.quiver(
    A, B, (C - A), (D - B),
    angles='xy', scale_units='xy', scale=1, alpha=0.5,
)

# K-means on the four raw coordinates (start and end point of each vector).
X_1 = np.array(df[['A', 'B', 'C', 'D']])

model = KMeans(n_clusters=20)
model.fit(X_1)

cluster_labels = model.predict(X_1)
df['n_cluster'] = cluster_labels

cc = model.cluster_centers_
centroids_1 = pd.DataFrame(
    data=cc,
    columns=['start_x', 'start_y', 'end_x', 'end_y'],
)

# One column per coordinate of the centroids: start x/y, end x/y.
a, b, c, d = cc.T

# Overlay each centroid as an arrow, drawn more opaque than the input vectors.
lc1 = ax.quiver(
    a, b, (c - a), (d - b),
    angles='xy', scale_units='xy', scale=1, alpha=0.8,
)

The following figure displays an example


Solution

  • What about this :

    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    
    import hdbscan
    
    df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
    plt.rcParams['image.cmap'] = 'Paired'
    
    # Column meaning: A = X start, B = Y start, C = X arrive, D = Y arrive.
    dx = df['C'] - df['A']
    dy = df['D'] - df['B']
    
    clusterer = hdbscan.HDBSCAN()
    
    # Features used for clustering: vector length and direction (in degrees).
    df['LENGTH'] = np.sqrt(np.square(dx) + np.square(dy))
    df['DIRECTION'] = np.degrees(np.arctan2(dy, dx))
    
    coords = df[['LENGTH', 'DIRECTION']].values
    cluster_labels = clusterer.fit_predict(coords)
    
    # The labels come back in the same order as the rows of `coords`, so they
    # are assigned positionally. Merging them back on the (LENGTH, DIRECTION)
    # values would duplicate rows of `df` whenever two vectors share the same
    # length and direction.
    # HDBSCAN labels noise as -1; those rows keep NaN and are drawn in black.
    colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
    df['CLUSTER'] = np.where(cluster_labels >= 0, cluster_labels, np.nan)
    # Clusters beyond the five colours listed above also fall back to black.
    df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')
    
    fig,ax = plt.subplots()
    ax.set_xlim(-5, 25)
    ax.set_ylim(-5, 25)
    
    ax.quiver(df.A, df.B, dx, dy, angles='xy', scale_units='xy', scale=1, alpha=0.5, color=df.COLOR)
    

    This clusters the vectors on length and direction (the direction is converted to degrees because, on my first try, the small range of radians did not work well with the model).

    I don't think this is a very "cartesian" solution, as the two values being analysed by the model are not expressed in the same units... But the visual results are not so bad...

    I also tried clustering on the 4 coordinates, which is more rigorous. But, as could be expected, it clusters the vectors by sub-areas of the space (when there are any):

    # Cluster on the raw start/end coordinates instead of length/direction.
    coords = df[['A', 'B', 'C', 'D']].values
    cluster_labels = clusterer.fit_predict(coords)
    
    # Labels are in row order, so assign them positionally; merging on the
    # coordinate values would duplicate rows of `df` for identical vectors.
    # HDBSCAN labels noise as -1; those rows keep NaN and are coloured black.
    colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
    df['CLUSTER'] = np.where(cluster_labels >= 0, cluster_labels, np.nan)
    # Clusters beyond the five colours listed above also fall back to black.
    df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')
    

    EDIT

    I gave it another try, based on the (very good) suggestion that angles are not a good variable because of the discontinuity around 0/2pi; so I chose to use both the sine and the cosine of the angle instead. I also scaled the length (so that the 3 variables have matching scales):

    So the result would be :

    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.preprocessing import robust_scale
    import hdbscan
    
    df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
    plt.rcParams['image.cmap'] = 'Paired'
    
    
    # Column meaning: A = X start, B = Y start, C = X arrive, D = Y arrive.
    dx = df['C'] - df['A']
    dy = df['D'] - df['B']
    
    
    # Features: the scaled length, plus the cosine and sine of the direction.
    # Using cos/sin instead of the raw angle avoids the discontinuity where
    # the angle wraps around.
    df['LENGTH'] = robust_scale(np.sqrt(np.square(dx) + np.square(dy)))
    df['DIRECTION'] = np.arctan2(dy, dx)
    df['COS'] = np.cos(df['DIRECTION'])
    df['SIN'] = np.sin(df['DIRECTION'])
    
    
    columns = ['LENGTH', 'COS', 'SIN']
    
    # A single clusterer instance is enough.
    clusterer = hdbscan.HDBSCAN()
    values = df[columns].values
    cluster_labels = clusterer.fit_predict(values)
    
    # Counts every distinct label, including the noise label if present.
    num_clusters = len(set(cluster_labels))
    # One row per label in 0..num_clusters-1: the member points and their count.
    clusters = pd.DataFrame(
            [(values[cluster_labels==n], len(values[cluster_labels==n])) for n in range(num_clusters)],
            columns=["points", "weight"]
            )
    
    
    def get_cmap(n, name='hsv'):
        '''
        Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 
        RGB color; the keyword argument name must be a standard mpl colormap name.
        
        Uses ``plt.get_cmap`` rather than ``plt.cm.get_cmap``: the latter was
        deprecated in Matplotlib 3.7 and removed in 3.9.
        
        Credits to @Ali
        https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib#answer-25628397
        '''
        return plt.get_cmap(name, n)
    
    cmap = get_cmap(num_clusters+1)
    colors = {x:cmap(x) for x in range(num_clusters)}
    
    # HDBSCAN labels noise as -1; real clusters are numbered from 0.
    labels = np.asarray(cluster_labels)
    n_found = np.unique(labels[labels >= 0]).size
    
    # Labels are in row order, so assign them positionally. Merging them back
    # on the feature values would duplicate rows of `df` whenever two vectors
    # share the same features. Noise gets the id right after the last real
    # cluster, so it ends up in the last populated subplot.
    df['CLUSTER'] = np.where(labels < 0, n_found, labels)
    df['COLOR'] = df['CLUSTER'].map(colors)
    print("Number of clusters : ", n_found)
    
    # Two subplots per row. squeeze=False keeps `axes` 2-D even when there is
    # a single row, so it can always be flattened.
    nrows = (num_clusters + 1) // 2
    fig,axes = plt.subplots(nrows=nrows, ncols=2, squeeze=False)
    axes = axes.ravel()
    for k,ax in enumerate(axes):
    
        ax.set_xlim(-5, 25)
        ax.set_ylim(-5, 25)
        ax.set_aspect('equal', adjustable='box')
        # Only real clusters get a title; the noise panel is left untitled.
        if k < n_found:
            ax.set_title(f"CLUSTER #{k+1}", fontsize=10)
        this_df = df[df.CLUSTER==k]
        ax.quiver(
            this_df.A, #X
            this_df.B, #Y
            (this_df.C-this_df.A), #X component of vector
            (this_df.D-this_df.B), #Y component of vector
            angles = 'xy', 
            scale_units = 'xy', 
            scale = 1, 
            color=this_df.COLOR
            ) 
    

    The results are much better (though they depend a lot on the input dataset); the last subplot shows the vectors that were not assigned to any cluster: output


    Edit #2

    If by "direction" you mean an angle in the [0..pi[ interval (i.e. undirected vectors), you will want to include the following code before computing the cosines/sines:

    # Fold every angle into [0, pi). Negative angles are shifted by +pi, and an
    # angle of exactly pi (dy == 0, dx < 0) is mapped to 0, so that it matches
    # the same undirected vector pointing the other way.
    df["DIRECTION"] = np.mod(df["DIRECTION"], np.pi)