Tags: python, scikit-learn, dbscan

Separate Clearly Defined clusters with DBSCAN


I have a dataset with 4 features, and the feature pairs (1, 4) and (2, 4) are clearly separable. [pairplot of the features, coloured by class]

I am trying to use DBSCAN to recover these clusters, but I am unable to obtain satisfactory results.

Here is the code snippet where I:

  • iterate over all combinations of eps and min_samples values,
  • run DBSCAN,
  • save the results if the number of clusters is more than 1 and less than 7.
#### STEP 4: DBSCAN ####
# Define the parameter combinations to evaluate
eps_values = [0.01, 0.03, 0.05, 0.07, 0.1, 0.15]
min_samples_values = [2, 3, 5, 7, 10, 15]

# Iterate over parameter combinations
names = []
for eps, min_samples in itertools.product(eps_values, min_samples_values):
    # Create a DBSCAN object with the current parameter values
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)

    # Fit the DBSCAN model to the data and obtain the cluster labels
    cluster_labels = dbscan.fit_predict(df_t[new_features])
    # Keep the result only if it yields between 2 and 6 distinct labels (noise label included)
    n_unique_labels = len(pd.Series(cluster_labels).unique())
    if 1 < n_unique_labels < 7:
        name = f"eps_{eps}_mins_{min_samples}"
        df_t[name] = cluster_labels
        names.append(name)
    # Filter out the outliers (-1 label) from the cluster labels
    filtered_labels = cluster_labels[cluster_labels != -1]
    print("Eps:", eps, "Min Samples:", min_samples, "clusters:", len(pd.Series(filtered_labels).unique()))

Here I am plotting the results for the runs that produced more than 1 and fewer than 7 clusters. As you can see, none of the parameter combinations gave satisfactory clusters that look like the original data. [grid of scatter plots, one per parameter combination]

Q: is it the code/setup that is making it unable to cluster properly?
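One diagnostic worth running here (a sketch added for context, not part of the original setup) is a k-distance plot: for each point, compute the distance to its min_samples-th nearest neighbour, sort those distances, and look for the "elbow". If the elbow sits far from the 0.01-0.15 range, the eps grid is simply on the wrong scale for the transformed features. This assumes df_t and new_features as defined in the complete code below; k = 5 is just one of the min_samples values from the grid.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

k = 5  # the min_samples value you intend to use (the point itself counts as its first neighbour)
nn = NearestNeighbors(n_neighbors=k).fit(df_t[new_features])
distances, _ = nn.kneighbors(df_t[new_features])

# Sorted distance to the k-th nearest neighbour; the elbow suggests a reasonable eps
k_distances = np.sort(distances[:, -1])
plt.plot(k_distances)
plt.xlabel("points sorted by k-distance")
plt.ylabel(f"distance to neighbour {k}")
plt.show()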

Here is the complete code that reproduces the example, for completeness. The steps are:

  • import the data
  • scale using min-max
  • create new features using MDS (multidimensional scaling)
  • run DBSCAN

The first 3 steps just set up the dataframe so it has the correct features.
import pandas as pd
import requests
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import MDS
from sklearn.cluster import DBSCAN


import itertools
%matplotlib inline

#### SETUP TO MAKE THE DATA ####


#### STEP1: IMPORT THE DATA ####

# Specify the URL of the ZIP file
zip_url = 'https://archive.ics.uci.edu/static/public/267/banknote+authentication.zip'

# Download the ZIP file
response = requests.get(zip_url)

# Save the ZIP file locally
zip_path = 'banknote_authentication.zip'
with open(zip_path, 'wb') as f:
    f.write(response.content)

# Extract the contents of the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall()

# Specify the path to the extracted CSV file
csv_path = 'data_banknote_authentication.txt'
column_names = ['variance', 'skewness', 'curtosis', 'entropy', 'original']
features  = ['variance', 'skewness', 'curtosis', 'entropy']
df = pd.read_csv(csv_path, names=column_names)



##### STEP2: SCALE THE DATA ####
mms = MinMaxScaler()
data = df.copy()
for col in features:
    data[col] = mms.fit_transform(data[[col]]).squeeze()


#### STEP 3: TRANSFORM WITH MDS ####

embedding = MDS(n_components=4, max_iter=300, random_state=10)

X_transformed = embedding.fit_transform(data[features])
new_features = ["1","2", "3", "4"]
df_t=pd.DataFrame(X_transformed , columns=new_features)
df_t['original'] = data["original"]



### SHOW THE DATA 
sns.set_context('notebook')
sns.set_style('white')
sns.pairplot(df_t, hue="original")


### CODE FOR MAKING DBSCAN AND PLOTS 

#### STEP 4: DBSCAN ####
# Define the parameter combinations to evaluate
eps_values = [0.01, 0.03, 0.05, 0.07, 0.1, 0.15]
min_samples_values = [2, 3, 5, 7, 10, 15]

# Iterate over parameter combinations
names = []
for eps, min_samples in itertools.product(eps_values, min_samples_values):
    # Create a DBSCAN object with the current parameter values
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)

    # Fit the DBSCAN model to the data and obtain the cluster labels
    cluster_labels = dbscan.fit_predict(df_t[new_features])
    # Keep the result only if it yields between 2 and 6 distinct labels (noise label included)
    n_unique_labels = len(pd.Series(cluster_labels).unique())
    if 1 < n_unique_labels < 7:
        name = f"eps_{eps}_mins_{min_samples}"
        df_t[name] = cluster_labels
        names.append(name)


    # Filter out the outliers (-1 label) from the cluster labels
    filtered_labels = cluster_labels[cluster_labels != -1]
    print("Eps:", eps, "Min Samples:", min_samples, "clusters:", len(pd.Series(filtered_labels).unique()))





###### PLOT THE DBSCAN RESULTS ####
df_plot = df_t.melt(id_vars=new_features, value_vars=['original'] + names, var_name="cluster")
df_plot['value'] = df_plot['value'].astype(str)


# Create a 3 by 4 subplot grid
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(12, 12))

# Flatten the axes for easy iteration
axes = axes.flatten()

# Iterate over each cluster and create scatter plots
for i, cluster in enumerate(df_plot['cluster'].unique()):
    ax = axes[i]  # Select the current subplot

    # Filter data for the current cluster
    subset = df_plot[df_plot['cluster'] == cluster]

    # Create scatter plot
    sns.scatterplot(data=subset, x="2", y="4", hue='value', legend='full', ax=ax)

    # Set subplot title
    ax.set_title(f"Cluster {cluster}", fontsize=12)

    # Set axis labels
    ax.set_xlabel("x")
    ax.set_ylabel("y")

    # Remove x and y ticks
    ax.set_xticks([])
    ax.set_yticks([])

# Adjust spacing between subplots
plt.tight_layout()

plt.show()

Answering Gijs Wobben's question: what's the point in having MDS with the same number of components as before?

Following a lecture, I was hoping to use MDS with the same number of dimensions to better separate the classes and so help the clustering algorithms. In the example provided in the lecture, you can see how the data is reorganized in a more visually separable way. [lecture example: pairplot before and after MDS]
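If you want to check whether the MDS embedding actually improves class separation (a sketch added for context, not part of the original post), one option is to compare a separation metric such as the silhouette score, using the known labels as the "clusters", before and after the transform. This assumes data, features, df_t and new_features from the code above.

from sklearn.metrics import silhouette_score

# Higher silhouette means the two classes are more compactly separated in that feature space
score_scaled = silhouette_score(data[features], data["original"])
score_mds = silhouette_score(df_t[new_features], df_t["original"])
print("silhouette on scaled features:", score_scaled)
print("silhouette on MDS embedding:  ", score_mds)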


Solution

  • It's not entirely clear what you hope to find, but in general you want to select scaling, dimensionality reduction, and clustering methods that create the best separation for your use case. For example, if you want the clusters to line up with the target, you could use min-max scaling (so no negative values remain), apply t-SNE to obtain an embedding in which the classes separate well, and run KMeans on the result with a fixed number of clusters:

    import zipfile
    from pathlib import Path
    
    import numpy as np
    import pandas as pd
    import requests
    from sklearn.cluster import KMeans
    from sklearn.manifold import TSNE
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import MinMaxScaler
    
    # Fix the random seed for reproducibility
    SEED = 42
    np.random.seed(SEED)
    
    ##### STEP 1: IMPORT THE DATA #####
    
    data_path = Path(__file__).parent / "data_banknote_authentication.txt"
    if not data_path.exists():
        print("Download the data")
        # Specify the URL of the ZIP file
        zip_url = (
            "https://archive.ics.uci.edu/static/public/267/banknote+authentication.zip"
        )
    
        # Download the ZIP file
        response = requests.get(zip_url)
    
        # Save the ZIP file locally
        zip_path = "banknote_authentication.zip"
        with open(zip_path, "wb") as f:
            f.write(response.content)
    
        # Extract the contents of the ZIP file
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall()
    
    print("Load the data")
    column_names = ["variance", "skewness", "curtosis", "entropy", "original"]
    features = ["variance", "skewness", "curtosis", "entropy"]
    target = "original"
    df = pd.read_csv(data_path, names=column_names)
    
    
    ##### STEP 2: SCALE THE DATA #####
    
    # Min-max scaling
    print("Scale the data (min-max)")
    min_max_scaler = MinMaxScaler()
    scaled_min_max = min_max_scaler.fit_transform(df[features])
    features_min_max = [f"{feature}_scaled_min_max" for feature in features]
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                scaled_min_max,
                columns=features_min_max,
            ),
        ],
        axis=1,
    )
    
    ##### STEP 3: DIMENSIONALITY REDUCTION ####
    
    # t-SNE (or any other method that yields good separation)
    print("Perform dimensionality reduction (t-SNE)")
    multi_dimensional_scaler = TSNE(
        n_components=2,  # Number of embedding dimensions to project down to
        random_state=SEED,
    )
    # Embed the min-max scaled features from step 2
    scaled_multi_dimensional = multi_dimensional_scaler.fit_transform(df[features_min_max])
    features_multi_dimensional = [
        f"tsne_component_{i}" for i in range(scaled_multi_dimensional.shape[1])
    ]
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                scaled_multi_dimensional,
                columns=features_multi_dimensional,
            ),
        ],
        axis=1,
    )
    
    # # Optional: Show the separation
    # df.plot.scatter(
    #     x=features_multi_dimensional[0],
    #     y=features_multi_dimensional[1],
    #     c=target,
    #     colormap="viridis",
    # )
    
    
    ##### STEP 4: CLUSTERING #####
    
    # K-means (or any other clustering method)
    clustering = KMeans(
        n_clusters=2,  # How many clusters do you hope to find?
        n_init="auto",
        random_state=SEED,
    )
    # KMeans is unsupervised, so it only sees the t-SNE features, not the target
    df["cluster"] = clustering.fit_predict(df[features_multi_dimensional])
    
    # # Optional: Show clusters over the original data
    # plot = pd.plotting.scatter_matrix(
    #     df[features],
    #     c=df["cluster"],
    #     figsize=(10, 10),
    #     marker="o",
    #     hist_kwds={"bins": 20},
    #     s=60,
    #     alpha=0.8,
    # )
    # plt.show()
    
    # Print the confusion matrix to show that the clusters mimic the target
    print(confusion_matrix(df[target], df["cluster"]))
    

    Small notes on your original code:

    • What's the point in having MDS with the same number of components as before?
    • Why did you choose DBSCAN over other methods? (If you do want DBSCAN, see the sketch below for running it on the t-SNE embedding.)
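    For completeness (a sketch added here, not part of the original answer): DBSCAN can also work on the t-SNE embedding from the code above, but eps has to match the scale of that embedding, which is usually far larger than the 0.01-0.15 grid from the question. This assumes df and features_multi_dimensional from the solution code; eps=3.0 is only an illustrative starting point and should be tuned, e.g. with a k-distance plot.

    from sklearn.cluster import DBSCAN

    # eps must be on the scale of the t-SNE coordinates; tune it for your data
    dbscan = DBSCAN(eps=3.0, min_samples=10)
    df["dbscan_cluster"] = dbscan.fit_predict(df[features_multi_dimensional])

    # -1 marks noise points; all other values are cluster labels
    print(df["dbscan_cluster"].value_counts())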