
Clustering based on semantic similarity returning no values


I have 'Key_Phrases' as a column in a pandas dataframe df. The objective is to cluster the phrases by semantic similarity. I am using a SentenceTransformer model.

 df['Key_Phrases'] is as follows:

                'Key_Phrases'

0              ['BYD' 'Daiwa Capital Markets analyst' 'NIO' 'Order flows'\n 'consumer preferences' 'cost pressures' 'raw materials'\n 'regulatory pressure' 'sales cannibalization' 'sales volume growth'\n 'vehicle batteries']
1              ['CANADA' 'Canada' 'Global Carbon Pricing Challenge'\n 'Major Economies Forum' 'climate finance commitment'\n 'developing countries' 'energy security' 'food security'\n 'international shipping' 'pollution pricing']
2              ['Clean Power Plan' 'EPA' 'Environmental Protection Agency'\n 'Supreme Court' 'Supreme Court decision' 'Virginia' 'West Virginia'\n 'renewable energy' 'tax subsidies']
3              ['BlueOvalSK' 'Ford' 'Ford Motor' 'Kathleen Valley' 'LG Energy' 'Liontown'\n 'Liontown Resources' 'SK Innovation' 'SK On' 'Tesla' 'battery metals'\n 'joint venture' 'lithium spodumene concentrate'\n 'lithium supply agreement']
4              ['Emissions Trading System' 'European Commission' 'European Parliament'\n 'ICIS' 'carbon border adjustment mechanism' 'carbon leakage']
5              ['Digital Industries' 'MG Motor India' 'MindSphere'\n 'Plant Simulation software' 'Siemens' 'carbon footprints'\n 'digitalisation' 'experience' 'intelligent manufacturing'\n 'production efficiency' 'strategic collaborations']
6              ['Malaysia' 'Mosti' 'NTIS' 'National Technology and Innovation Sandbox'\n 'National Urbanisation Policy' 'Sunway Innovation Labs'\n 'Sunway iLabs Super Accelerator' 'economic growth'\n 'memorandum of understanding' 'quality of life' 'safe environment'\n 'smart cities' 'smart city sandbox' 'urban management' 'urban population']
7              ['Artificial Intelligence' 'Electricity and Water Authority'\n 'Green Mobility' 'Grid Automation' 'Internet of Things' 'Smart Dubai'\n 'Smart Energy Solutions' 'Smart Grid' 'Smart Water'\n 'artificial intelligence' 'blockchain' 'connected services'\n 'energy storage' 'integrated systems' 'interoperability' 'smart city'\n 'smart grid' 'sustainability' 'water network']
8              ['Artificial Intelligence' 'Clean Energy Strategy 2050'\n 'Dubai Electricity and Water Authority' 'Green Mobility'\n 'Grid Automation' 'Internet of Things' 'Smart Dubai'\n 'Smart Energy Solutions' 'Smart Grid' 'Smart Water'\n 'Zero Carbon Emissions Strategy' 'artificial intelligence' 'blockchain'\n 'clean energy sources' 'connected services' 'energy storage'\n 'integrated systems' 'interoperability' 'smart city' 'smart grid'\n 'sustainability']

from sentence_transformers import SentenceTransformer, util
import numpy as np

Key_Phrases_list_1 = df['Key_Phrases'].tolist()

model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')
# Encoding is done in one simple step
embeddings = model.encode(Key_Phrases_list_1, show_progress_bar=True, convert_to_numpy=True)

Then the following function is created:

def detect_clusters(embeddings, threshold=0.90, min_community_size=20):
    # Compute cosine similarity scores
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # Keep the top k similarity scores per row (k = minimum community size)
    top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)
    # Filter for rows >= min_threshold
    extracted_communities = []
    for i in range(len(top_k_values)):
        if top_k_values[i][-1] >= threshold:
            new_cluster = []

            # Only check the top k most similar entries
            top_val_large, top_idx_large = cos_scores[i].topk(k=init_max_size, largest=True)
            top_idx_large = top_idx_large.tolist()
            top_val_large = top_val_large.tolist()
            
            if top_val_large[-1] < threshold:
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break
                    new_cluster.append(idx)
            else:
                # Iterate over all entries (slow)
                for idx, val in enumerate(cos_scores[i].tolist()):
                    if val >= threshold:
                        new_cluster.append(idx)
                        
            extracted_communities.append(new_cluster)

    unique_communities = []
    extracted_ids = set()
        
    for community in extracted_communities:
        add_cluster = True
        for idx in community:
            if idx in extracted_ids:
                add_cluster = False
                break
        if add_cluster:
            unique_communities.append(community)
            for idx in community:
                extracted_ids.add(idx)
    return unique_communities

Then the function is called:

clusters = detect_clusters(embeddings, min_community_size=6, threshold=0.75)

I am getting no values in return. Am I missing anything in the detect_clusters function?


Solution
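
  • On why detect_clusters returns nothing: init_max_size is never defined inside the function, and the early check top_k_values[i][-1] >= threshold filters out every row whenever no entry has min_community_size neighbours above the threshold, so extracted_communities stays empty. (If a row did pass that check, the undefined init_max_size would raise a NameError.)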

  • As the OP asked for a solution where the number of clusters is selected automatically, it is easier to use something more robust like sklearn:

    from sentence_transformers import SentenceTransformer, util
    import numpy as np
    model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    def choose_classifier(X):
        # Normalize each embedding to unit length so that euclidean
        # KMeans behaves like clustering by cosine similarity
        X1 = X / np.sqrt((X**2).sum(axis=-1, keepdims=True))
        vv = []
        cc = np.arange(2, len(X))
        for nclusters in cc:
            km_model = KMeans(nclusters).fit(X1)
            labels = km_model.labels_
            v = silhouette_score(X1, labels)
            vv.append(v)
        # Pick the cluster count with the highest silhouette score
        nclusters = cc[np.argmax(vv)]
        return KMeans(nclusters).fit(X1)
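
    The silhouette score ranges from -1 to 1 and is highest when points sit closer to their own cluster than to any other, so taking the argmax picks the best-separated number of clusters. Note that this fits one KMeans per candidate count, which is cheap for a handful of rows but can get slow on large datasets.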
    
    

    Use it like this

    phrases = [
        'I like ice cream',
        'I like cake',
        'You are so kind',
        'You are very intelligent'
    ]
    embeddings = model.encode(phrases, show_progress_bar=True, convert_to_numpy=True)
    
    classifier = choose_classifier(embeddings)
    
    for i, (v, s) in enumerate(zip(embeddings, phrases)):
        # Apply the same unit-length normalization used inside choose_classifier
        v1 = v / np.sqrt((v**2).sum())
        print(classifier.predict(v1[np.newaxis]), s)
    
    [1] I like ice cream
    [1] I like cake
    [0] You are so kind
    [0] You are very intelligent
    

    GPU-capable solution

    At first sight I couldn't grasp everything you are doing in your code, but let me suggest a simplified method. I use kmeans_pytorch, and I exploit two facts: the squared euclidean distance is dot(A-B, A-B) = dot(A, A) + dot(B, B) - 2 * dot(A, B), and the cosine similarity is dot(A, B) / sqrt(dot(A, A) * dot(B, B)). So (1) multiplying A or B by a positive scalar does not change the cosine similarity, and (2) if A and B are unit vectors, the squared distance reduces to 2 - 2 * dot(A, B), so minimizing euclidean distance maximizes cosine similarity. Given the set of vectors you want to cluster you can therefore (1) normalize all of them to unit length, then (2) compute the clusters that minimize euclidean distance. Those are exactly the clusters that maximize cosine similarity.
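
    As a quick numerical sanity check of this identity:

    import numpy as np

    rng = np.random.default_rng(0)
    A, B = rng.normal(size=(2, 5))
    # Normalize both vectors to unit length
    A = A / np.sqrt(A @ A)
    B = B / np.sqrt(B @ B)
    # For unit vectors: dot(A - B, A - B) == 2 - 2 * dot(A, B)
    assert np.isclose((A - B) @ (A - B), 2 - 2 * (A @ B))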

    pip install kmeans_pytorch
    

    Setup

    Since you didn't give data, I will generate an example myself:

    import torch

    # 2D example data: generate some random points around three centers
    NPC = 10
    X = torch.cat([
        (torch.randn((NPC, 2)) + c) * (torch.rand((NPC, 1))**2 + 1) / 2
        for c in torch.tensor([[5, 3], [-7, 0], [0, -7]])])
    
    

    Solution

    This is the code

    from kmeans_pytorch import kmeans
    import torch

    def detect_clusters(X, nclusters, tol=1e-6):
        X = torch.as_tensor(X)
        assert X.ndim == 2
        # Project the points onto the unit hypersphere
        X1 = X / torch.sqrt(torch.sum(X**2, dim=-1, keepdim=True))

        # Run kmeans on the normalized points with euclidean distance
        cluster_ID, C = kmeans(X1, nclusters, distance='euclidean', tol=tol)
        return cluster_ID, C
    

    Example visualization

    import matplotlib.pyplot as plt
    import numpy as np
    import torch

    #### THE RESULTS ####
    cluster_ID, C = detect_clusters(X, 3)
    # Recompute the normalized points for plotting
    X1 = X / torch.sqrt(torch.sum(X**2, dim=-1, keepdim=True))
    # Avoid distortion of the angles
    plt.gca().set_aspect('equal')
    # Initial points
    plt.plot(X[:, 0], X[:, 1], '.')
    # Reference unit circle
    theta = torch.linspace(0, 2*np.pi, 1000)
    plt.plot(torch.cos(theta), torch.sin(theta), '--k')
    # Normalized points projected onto the circle
    plt.plot(X1[:, 0], X1[:, 1], '.')
    xlim = plt.xlim()
    ylim = plt.ylim()
    plt.xlim(xlim)
    plt.ylim(ylim)

    # Draw lines in the directions given by the centroids
    R = 20
    for c in C:
        plt.plot([0, c[0]*R], [0, c[1]*R])

    plt.grid()
    

    (Plots: the raw points with their unit-circle projections, and lines from the origin through each cluster centroid.)

    Using with sentence embeddings

    Some example embeddings

    from sentence_transformers import SentenceTransformer, util
    import numpy as np
    
    model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')   
    
    
    phrases = [
        'I like ice cream',
        'I like cake',
        'You are so kind',
        'You are very intelligent'
    ]
    embeddings = model.encode(phrases, show_progress_bar=True, convert_to_numpy=True)
    

    Then you can pass the embeddings to the detect_clusters function I provided above:

    label, center = detect_clusters(torch.as_tensor(embeddings), 2)
    for c, s in zip(label, phrases):
        print(f'[{c}] {s}')
    

    That should give you the sentences with their corresponding cluster:

    [0] I like ice cream
    [0] I like cake
    [1] You are so kind
    [1] You are very intelligent
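
    To tie this back to the question's dataframe, here is a minimal sketch (assuming df['Key_Phrases'] holds the strings shown above, and an illustrative choice of 3 clusters) that attaches a cluster label to each row:

    import torch

    Key_Phrases_list_1 = df['Key_Phrases'].tolist()
    embeddings = model.encode(Key_Phrases_list_1, convert_to_numpy=True)
    labels, centers = detect_clusters(torch.as_tensor(embeddings), 3)
    # One cluster id per dataframe row
    df['cluster'] = labels.tolist()
    print(df.sort_values('cluster'))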