
Clustering based on semantic similarity returning no values


I have 'Key_Phrases' as a column in a pandas dataframe df. The objective is to cluster the phrases by semantic similarity. I am using a SentenceTransformer model.

 df['Key_Phrases'] is as follows:

                'Key_Phrases'

0              ['BYD' 'Daiwa Capital Markets analyst' 'NIO' 'Order flows'\n 'consumer preferences' 'cost pressures' 'raw materials'\n 'regulatory pressure' 'sales cannibalization' 'sales volume growth'\n 'vehicle batteries']
1              ['CANADA' 'Canada' 'Global Carbon Pricing Challenge'\n 'Major Economies Forum' 'climate finance commitment'\n 'developing countries' 'energy security' 'food security'\n 'international shipping' 'pollution pricing']
2              ['Clean Power Plan' 'EPA' 'Environmental Protection Agency'\n 'Supreme Court' 'Supreme Court decision' 'Virginia' 'West Virginia'\n 'renewable energy' 'tax subsidies']
3              ['BlueOvalSK' 'Ford' 'Ford Motor' 'Kathleen Valley' 'LG Energy' 'Liontown'\n 'Liontown Resources' 'SK Innovation' 'SK On' 'Tesla' 'battery metals'\n 'joint venture' 'lithium spodumene concentrate'\n 'lithium supply agreement']
4              ['Emissions Trading System' 'European Commission' 'European Parliament'\n 'ICIS' 'carbon border adjustment mechanism' 'carbon leakage']
5              ['Digital Industries' 'MG Motor India' 'MindSphere'\n 'Plant Simulation software' 'Siemens' 'carbon footprints'\n 'digitalisation' 'experience' 'intelligent manufacturing'\n 'production efficiency' 'strategic collaborations']
6              ['Malaysia' 'Mosti' 'NTIS' 'National Technology and Innovation Sandbox'\n 'National Urbanisation Policy' 'Sunway Innovation Labs'\n 'Sunway iLabs Super Accelerator' 'economic growth'\n 'memorandum of understanding' 'quality of life' 'safe environment'\n 'smart cities' 'smart city sandbox' 'urban management' 'urban population']
7              ['Artificial Intelligence' 'Electricity and Water Authority'\n 'Green Mobility' 'Grid Automation' 'Internet of Things' 'Smart Dubai'\n 'Smart Energy Solutions' 'Smart Grid' 'Smart Water'\n 'artificial intelligence' 'blockchain' 'connected services'\n 'energy storage' 'integrated systems' 'interoperability' 'smart city'\n 'smart grid' 'sustainability' 'water network']
8              ['Artificial Intelligence' 'Clean Energy Strategy 2050'\n 'Dubai Electricity and Water Authority' 'Green Mobility'\n 'Grid Automation' 'Internet of Things' 'Smart Dubai'\n 'Smart Energy Solutions' 'Smart Grid' 'Smart Water'\n 'Zero Carbon Emissions Strategy' 'artificial intelligence' 'blockchain'\n 'clean energy sources' 'connected services' 'energy storage'\n 'integrated systems' 'interoperability' 'smart city' 'smart grid'\n 'sustainability']

from sentence_transformers import SentenceTransformer, util
import numpy as np

Key_Phrases_list_1 = df['Key_Phrases'].tolist()

model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')
# Encoding is done in one simple step
embeddings = model.encode(Key_Phrases_list_1, show_progress_bar=True, convert_to_numpy=True)

Then the following function is created:

def detect_clusters(embeddings, threshold=0.90, min_community_size=20):
    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # Keep the top k similarity scores per row (k = minimum community size)
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)
    # Filter for rows >= min_threshold
    extracted_communities = []
    for i in range(len(top_k_values)):
        if top_k_values[i][-1] >= threshold:
            new_cluster = []

            # Only check the top k most similar entries
            top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
            top_idx_large = top_idx_large.tolist()
            top_val_large = top_val_large.tolist()
            
            if top_val_large[-1] < threshold:
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break
                    new_cluster.append(idx)
            else:
                # Iterate over all entries (slow)
                for idx, val in enumerate(cos_scores[i].tolist()):
                    if val >= threshold:
                        new_cluster.append(idx)
                        
            extracted_communities.append(new_cluster)

    unique_communities = []
    extracted_ids = set()
        
    for community in extracted_communities:
        add_cluster = True
        for idx in community:
            if idx in extracted_ids:
                add_cluster = False
                break
        if add_cluster:
            unique_communities.append(community)
            for idx in community:
                extracted_ids.add(idx)
    return unique_communities

Then the function is called:

clusters = detect_clusters(embeddings, min_community_size=6, threshold=0.75)

I am getting no values in return. Am I missing anything in the detect_clusters function?


Solution
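
  • On why detect_clusters returns nothing: init_max_size is never defined inside the function, and the early check top_k_values[i][-1] >= threshold filters out every row whenever no entry has min_community_size neighbours above the threshold, so extracted_communities stays empty. (If a row did pass that check, the undefined init_max_size would raise a NameError.)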

  • As the OP asked for a solution where the number of clusters is selected automatically, it is easier to use something more robust like sklearn:

    from sentence_transformers import SentenceTransformer, util
    import numpy as np
    model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    def choose_classifier(X):
        # Normalize each embedding to unit length so that euclidean
        # KMeans behaves like clustering by cosine similarity
        X1 = X / np.sqrt((X**2).sum(axis=-1, keepdims=True))
        vv = []
        cc = np.arange(2, len(X))
        for nclusters in cc:
            km_model = KMeans(nclusters).fit(X1)
            labels = km_model.labels_
            v = silhouette_score(X1, labels)
            vv.append(v)
        # Pick the cluster count with the highest silhouette score
        nclusters = cc[np.argmax(vv)]
        return KMeans(nclusters).fit(X1)
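
    The silhouette score ranges from -1 to 1 and is highest when points sit closer to their own cluster than to any other, so taking the argmax picks the best-separated number of clusters. Note that this fits one KMeans per candidate count, which is cheap for a handful of rows but can get slow on large datasets.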
    
    

    Use it like this

    phrases = [
        'I like ice cream',
        'I like cake',
        'You are so kind',
        'You are very intelligent'
    ]
    embeddings = model.encode(phrases, show_progress_bar=True, convert_to_numpy=True)
    
    classifier = choose_classifier(embeddings)
    
    for i, (v, s) in enumerate(zip(embeddings, phrases)):
        # Apply the same unit-length normalization used inside choose_classifier
        v1 = v / np.sqrt((v**2).sum())
        print(classifier.predict(v1[np.newaxis]), s)
    
    [1] I like ice cream
    [1] I like cake
    [0] You are so kind
    [0] You are very intelligent
    

    GPU-capable solution

    At first sight I couldn't grasp everything you are doing in your code, but let me suggest a simplified method. I use kmeans_pytorch, and I exploit two facts: the squared euclidean distance is dot(A-B, A-B) = dot(A, A) + dot(B, B) - 2 * dot(A, B), and the cosine similarity is dot(A, B) / sqrt(dot(A, A) * dot(B, B)). So (1) multiplying A or B by a positive scalar does not change the cosine similarity, and (2) if A and B are unit vectors, the squared distance reduces to 2 - 2 * dot(A, B), so minimizing euclidean distance maximizes cosine similarity. Given the set of vectors you want to cluster you can therefore (1) normalize all of them to unit length, then (2) compute the clusters that minimize euclidean distance. Those are exactly the clusters that maximize cosine similarity.
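
    As a quick numerical sanity check of this identity:

    import numpy as np

    rng = np.random.default_rng(0)
    A, B = rng.normal(size=(2, 5))
    # Normalize both vectors to unit length
    A = A / np.sqrt(A @ A)
    B = B / np.sqrt(B @ B)
    # For unit vectors: dot(A - B, A - B) == 2 - 2 * dot(A, B)
    assert np.isclose((A - B) @ (A - B), 2 - 2 * (A @ B))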

    pip install kmeans_pytorch
    

    Setup

    Since you didn't give data, I will generate an example myself:

    import torch

    # 2D example data: generate some random points around three centers
    NPC = 10
    X = torch.cat([
        (torch.randn((NPC, 2)) + c) * (torch.rand((NPC, 1))**2 + 1) / 2
        for c in torch.tensor([[5, 3], [-7, 0], [0, -7]])])
    
    

    Solution

    This is the code

    from kmeans_pytorch import kmeans
    import torch

    def detect_clusters(X, nclusters, tol=1e-6):
        X = torch.as_tensor(X)
        assert X.ndim == 2
        # Project the points onto the unit hypersphere
        X1 = X / torch.sqrt(torch.sum(X**2, dim=-1, keepdim=True))

        # Run kmeans on the normalized points with euclidean distance
        cluster_ID, C = kmeans(X1, nclusters, distance='euclidean', tol=tol)
        return cluster_ID, C
    

    Example visualization

    import matplotlib.pyplot as plt
    import numpy as np
    import torch

    #### THE RESULTS ####
    cluster_ID, C = detect_clusters(X, 3)
    # Recompute the normalized points for plotting
    X1 = X / torch.sqrt(torch.sum(X**2, dim=-1, keepdim=True))
    # Avoid distortion of the angles
    plt.gca().set_aspect('equal')
    # Initial points
    plt.plot(X[:, 0], X[:, 1], '.')
    # Reference unit circle
    theta = torch.linspace(0, 2*np.pi, 1000)
    plt.plot(torch.cos(theta), torch.sin(theta), '--k')
    # Normalized points projected onto the circle
    plt.plot(X1[:, 0], X1[:, 1], '.')
    xlim = plt.xlim()
    ylim = plt.ylim()
    plt.xlim(xlim)
    plt.ylim(ylim)

    # Draw lines in the directions given by the centroids
    R = 20
    for c in C:
        plt.plot([0, c[0]*R], [0, c[1]*R])

    plt.grid()
    

    (Plots: the raw points with their unit-circle projections, and lines from the origin through each cluster centroid.)

    Using with sentence embeddings

    Some example embeddings

    from sentence_transformers import SentenceTransformer, util
    import numpy as np
    
    model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')   
    
    
    phrases = [
        'I like ice cream',
        'I like cake',
        'You are so kind',
        'You are very intelligent'
    ]
    embeddings = model.encode(phrases, show_progress_bar=True, convert_to_numpy=True)
    

    Then you can pass the embeddings to the detect_clusters function I provided above:

    label, center = detect_clusters(torch.as_tensor(embeddings), 2)
    for c, s in zip(label, phrases):
        print(f'[{c}] {s}')
    

    That should give you the sentences with their corresponding cluster:

    [0] I like ice cream
    [0] I like cake
    [1] You are so kind
    [1] You are very intelligent
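
    To tie this back to the question's dataframe, here is a minimal sketch (assuming df['Key_Phrases'] holds the strings shown above, and an illustrative choice of 3 clusters) that attaches a cluster label to each row:

    import torch

    Key_Phrases_list_1 = df['Key_Phrases'].tolist()
    embeddings = model.encode(Key_Phrases_list_1, convert_to_numpy=True)
    labels, centers = detect_clusters(torch.as_tensor(embeddings), 3)
    # One cluster id per dataframe row
    df['cluster'] = labels.tolist()
    print(df.sort_values('cluster'))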