Search code examples
python nlp artificial-intelligence word-embedding bert-language-model

How to cluster similar sentences using BERT


For ElMo, FastText and Word2Vec, I'm averaging the word embeddings within a sentence and using HDBSCAN/KMeans clustering to group similar sentences.

A good example of the implementation can be seen in this short article: http://ai.intelligentonlinetools.com/ml/text-clustering-word-embedding-machine-learning/

I would like to do the same thing using BERT (using the BERT Python package from Hugging Face); however, I am rather unfamiliar with how to extract the raw word/sentence vectors in order to input them into a clustering algorithm. I know that BERT can output sentence representations - so how would I actually extract the raw vectors from a sentence?

Any information would be helpful.


Solution

  • As Subham Kumar mentioned, one can use this Python 3 library to compute sentence similarity: https://github.com/UKPLab/sentence-transformers

    The library has a few code examples to perform clustering:

    fast_clustering.py:

    """
    This is a more complex example of performing clustering on a large-scale dataset.
    
    This example finds local communities in a large set of sentences, i.e., groups of sentences that are highly
    similar. You can freely configure the threshold for what is considered similar. A high threshold will
    only find extremely similar sentences; a lower threshold will find more sentences that are less similar.
    
    A second parameter is 'min_community_size': Only communities with at least a certain number of sentences will be returned.
    
    The method for finding the communities is extremely fast: clustering 50k sentences requires only 5 seconds (plus embedding computation).
    
    In this example, we download a large set of questions from Quora and then find similar questions in this set.
    """
    from sentence_transformers import SentenceTransformer, util
    import os
    import csv
    import time
    
    
    # Model used to compute the sentence embeddings. We use one trained for similar-question detection.
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # We download the Quora Duplicate Questions Dataset (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)
    # and find similar questions in it.
    url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
    dataset_path = "quora_duplicate_questions.tsv"
    max_corpus_size = 50000  # We limit our corpus to only the first 50k unique questions

    # Download the dataset only if it is not already present locally.
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Collect the unique questions from the file, in file order.
    # A dict (insertion-ordered) is used for de-duplication instead of a set so that
    # the order of corpus_sentences is identical on every run, and the size limit is
    # checked before every insertion so the corpus never grows past max_corpus_size
    # (adding both questions of a row before checking could overshoot by one).
    unique_questions = {}
    with open(dataset_path, encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            for column in ('question1', 'question2'):
                if len(unique_questions) < max_corpus_size:
                    unique_questions[row[column]] = None
            if len(unique_questions) >= max_corpus_size:
                break

    corpus_sentences = list(unique_questions)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)


    print("Start clustering")
    start_time = time.time()

    # Two parameters to tune:
    # min_community_size: Only consider communities that have at least 25 elements
    # threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
    clusters = util.community_detection(corpus_embeddings, min_community_size=25, threshold=0.75)

    print("Clustering done after {:.2f} sec".format(time.time() - start_time))

    # For every cluster, print its first 3 and its last 3 elements.
    # Each cluster is a list of indices into corpus_sentences.
    for i, cluster in enumerate(clusters):
        print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
        for sentence_id in cluster[0:3]:
            print("\t", corpus_sentences[sentence_id])
        print("\t", "...")
        for sentence_id in cluster[-3:]:
            print("\t", corpus_sentences[sentence_id])
    
    

    kmeans.py:

    """
    This is a simple application for sentence embeddings: clustering
    
    Sentences are mapped to sentence embeddings and then k-means clustering is applied.
    """
    from sentence_transformers import SentenceTransformer
    from sklearn.cluster import KMeans
    
    embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # Example sentences to be grouped
    corpus = [
        'A man is eating food.',
        'A man is eating a piece of bread.',
        'A man is eating pasta.',
        'The girl is carrying a baby.',
        'The baby is carried by the woman',
        'A man is riding a horse.',
        'A man is riding a white horse on an enclosed ground.',
        'A monkey is playing drums.',
        'Someone in a gorilla costume is playing a set of drums.',
        'A cheetah is running behind its prey.',
        'A cheetah chases prey on across a field.',
    ]
    corpus_embeddings = embedder.encode(corpus)

    # Fit k-means on the embeddings and read back one cluster label per sentence
    num_clusters = 5
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_

    # One bucket per cluster; every sentence is appended to the bucket its label points at
    clustered_sentences = [[] for _ in range(num_clusters)]
    for sentence, cluster_id in zip(corpus, cluster_assignment):
        clustered_sentences[cluster_id].append(sentence)

    # Report the clusters using 1-based numbering
    for cluster_number, members in enumerate(clustered_sentences, start=1):
        print("Cluster ", cluster_number)
        print(members)
        print("")
    

    agglomerative.py:

    """
    This is a simple application for sentence embeddings: clustering
    
    Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
    """
    from sentence_transformers import SentenceTransformer
    from sklearn.cluster import AgglomerativeClustering
    import numpy as np
    
    embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # Example sentences to be grouped
    corpus = [
        'A man is eating food.',
        'A man is eating a piece of bread.',
        'A man is eating pasta.',
        'The girl is carrying a baby.',
        'The baby is carried by the woman',
        'A man is riding a horse.',
        'A man is riding a white horse on an enclosed ground.',
        'A monkey is playing drums.',
        'Someone in a gorilla costume is playing a set of drums.',
        'A cheetah is running behind its prey.',
        'A cheetah chases prey on across a field.',
    ]
    corpus_embeddings = embedder.encode(corpus)

    # Scale every embedding (row) to unit length
    row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
    corpus_embeddings = corpus_embeddings / row_norms

    # Perform agglomerative clustering. No fixed number of clusters is requested
    # (n_clusters=None); a distance threshold of 1.5 is passed instead.
    # The original author left an alternative configuration commented out:
    # affinity='cosine', linkage='average', distance_threshold=0.4
    clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_

    # Group the sentences by their assigned cluster label
    clustered_sentences = {}
    for sentence, cluster_id in zip(corpus, cluster_assignment):
        clustered_sentences.setdefault(cluster_id, []).append(sentence)

    # Report the clusters; labels are shifted by one for display
    for cluster_id, members in clustered_sentences.items():
        print("Cluster ", cluster_id + 1)
        print(members)
        print("")