Tags: python, word-embedding, cosine-similarity

Calculating similarities of text embeddings using CLIP


I am trying to use CLIP to calculate the similarities between strings. (I know that CLIP is usually used with text and images, but it should work with strings alone as well.)

I provide a list of simple text prompts and calculate the similarity between their embeddings. The similarities are off, but I can't figure out what I'm doing wrong.

import torch
import clip
from torch.nn import CosineSimilarity
from torch.nn.functional import normalize

cos = CosineSimilarity(dim=1, eps=1e-6)

def gen_features(model, text):
    # tokenize the prompt and encode it with CLIP's text encoder
    tokens = clip.tokenize([text]).to(device)
    text_features = model.encode_text(tokens)

    return text_features

def dist(v1, v2):
    #return torch.dist(normalize(v1), normalize(v2)) # euclidean distance
    #return cos(normalize(v1), normalize(v2)).item() # cosine similarity

    # dot product of the L2-normalized embeddings
    similarity = (normalize(v1) @ normalize(v2).T)

    return similarity.item()



device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "ViT-B/32"
model, _ = clip.load(model_name, device=device)

sentences = ["A cat", "A dog", "A labrador", "A poodle", "A wolf", "A lion", "A house"]

with torch.no_grad():
    embeddings = [(sentence, gen_features(model, sentence)) for sentence in sentences]
    for label1, embedding1 in embeddings:
        for label2, embedding2 in embeddings:
            print(f"{label1} -> {label2}: {dist(embedding1, embedding2)}")


Output


    A cat -> A cat: 0.9999998211860657
    A cat -> A dog: 0.9361147880554199
    A cat -> A labrador: 0.8170720934867859
    A cat -> A poodle: 0.8438302278518677
    A cat -> A wolf: 0.9086413979530334
    A cat -> A lion: 0.8914517164230347
    A cat -> A house: 0.8724125027656555
    A dog -> A cat: 0.9361147880554199
    A dog -> A dog: 1.0000004768371582
    A dog -> A labrador: 0.8481228351593018
    A dog -> A poodle: 0.9010260105133057
    A dog -> A wolf: 0.9260395169258118
    A dog -> A lion: 0.886112630367279
    A dog -> A house: 0.8852840662002563
    A labrador -> A cat: 0.8170720934867859
    A labrador -> A dog: 0.8481228351593018
    A labrador -> A labrador: 1.000000238418579
    A labrador -> A poodle: 0.7722526788711548
    A labrador -> A wolf: 0.8111101984977722
    A labrador -> A lion: 0.783727765083313
    A labrador -> A house: 0.7569846510887146
    A poodle -> A cat: 0.8438302278518677
    A poodle -> A dog: 0.9010260105133057
    A poodle -> A labrador: 0.7722526788711548
    A poodle -> A poodle: 0.999999463558197
    A poodle -> A wolf: 0.8539597988128662
    A poodle -> A lion: 0.8460092544555664
    A poodle -> A house: 0.8119628429412842
    A wolf -> A cat: 0.9086413979530334
    A wolf -> A dog: 0.9260395169258118
    A wolf -> A labrador: 0.8111101984977722
    A wolf -> A poodle: 0.8539597988128662
    A wolf -> A wolf: 1.000000238418579
    A wolf -> A lion: 0.9043934941291809
    A wolf -> A house: 0.860664427280426
    A lion -> A cat: 0.8914517164230347
    A lion -> A dog: 0.886112630367279
    A lion -> A labrador: 0.783727765083313
    A lion -> A poodle: 0.8460092544555664
    A lion -> A wolf: 0.9043934941291809
    A lion -> A lion: 1.0000004768371582
    A lion -> A house: 0.8402873873710632
    A house -> A cat: 0.8724125027656555
    A house -> A dog: 0.8852840662002563
    A house -> A labrador: 0.7569846510887146
    A house -> A poodle: 0.8119628429412842
    A house -> A wolf: 0.860664427280426
    A house -> A lion: 0.8402873873710632
    A house -> A house: 0.9999997615814209


The results show that a dog is closer to a house than to a labrador (0.885 vs. 0.848), which doesn't make sense. I've tried cosine similarity and euclidean distance to check whether the distance measure was wrong, but the results are similar. Where am I going wrong?
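
Note that the three measures in dist are essentially interchangeable here: after L2-normalization the dot product is the cosine similarity, and the euclidean distance between unit vectors is a monotone function of it, so switching between them cannot change the ranking. A quick sanity check with random vectors (not part of the script above):

    import torch
    from torch.nn import CosineSimilarity
    from torch.nn.functional import normalize

    cos = CosineSimilarity(dim=1, eps=1e-6)
    v1, v2 = torch.randn(1, 512), torch.randn(1, 512)

    dot = (normalize(v1) @ normalize(v2).T).item()        # dot product of normalized vectors
    cs = cos(v1, v2).item()                               # cosine similarity
    eu = torch.dist(normalize(v1), normalize(v2)).item()  # euclidean distance of unit vectors

    print(abs(dot - cs) < 1e-6)              # True: same value
    print(abs(eu**2 - (2 - 2 * cs)) < 1e-5)  # True: ||u - v||^2 = 2 - 2*cos(u, v)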


Solution

  • If you use the text embeddings from the output of CLIPTextModel ([number of prompts, 77, 512]), flatten them to [number of prompts, 77 × 512 = 39424], and then apply cosine similarity, you'll get improved results.

    This code lets you test both solutions ([1,512] and [77,512]). I'm running it on Google Colab.

        !pip install -U torch transformers 
        
        import torch
        from torch.nn import CosineSimilarity
        from transformers import CLIPTokenizer, CLIPModel, CLIPTextModel
        cossim = CosineSimilarity(dim=0, eps=1e-6)
        
        def dist(v1, v2):
          return cossim(v1, v2)
        
        torch_device = "cuda" if torch.cuda.is_available() else "cpu"
        
        models = [
            'openai/clip-vit-base-patch16',
            'openai/clip-vit-base-patch32',
            'openai/clip-vit-large-patch14',
        ]
        
        model_id = models[1]
        
        tokenizer = CLIPTokenizer.from_pretrained(model_id)
        text_encoder = CLIPTextModel.from_pretrained(model_id).to(torch_device)
        model = CLIPModel.from_pretrained(model_id).to(torch_device)
        
        prompts = [
          "A cat", "A dog", "A labrador", "A poodle", "A wolf", "A lion", "A house",
        ] 
        
        text_inputs = tokenizer(
            prompts, 
            padding="max_length", 
            return_tensors="pt",
            ).to(torch_device)
        # [num_prompts, 512]: pooled, projected embedding (the same vector the question's code compares)
        text_features = model.get_text_features(**text_inputs)
        # [num_prompts, 77*512 = 39424]: all 77 per-token hidden states, flattened into one vector
        text_embeddings = torch.flatten(
            text_encoder(text_inputs.input_ids.to(torch_device))['last_hidden_state'], 1, -1
        )
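        
        # (addition, not in the original answer) sanity-check the two shapes being compared,
        # assuming model_id = models[1] ('openai/clip-vit-base-patch32'):
        print(text_features.shape)    # expected: torch.Size([7, 512])
        print(text_embeddings.shape)  # expected: torch.Size([7, 39424]), i.e. 77 tokens * 512 dims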
        
        print("\n\nusing text_features")
        for i1, label1 in enumerate(prompts):
          for i2, label2 in enumerate(prompts):
            if (i2>=i1):
              print(f"{label1} <-> {label2} = {dist(text_features[i1], text_features[i2]):.4f}")
        
        print("\n\nusing text_embeddings")
        for i1, label1 in enumerate(prompts):
          for i2, label2 in enumerate(prompts):
            if (i2>=i1):
              print(f"{label1} <-> {label2} = {dist(text_embeddings[i1], text_embeddings[i2]):.4f}")
    
    

    You'll get the same values for the [1,512] embedding as in the question's output:

    A cat <-> A cat = 1.0000
    A cat <-> A dog = 0.9361
    A cat <-> A labrador = 0.8171
    A cat <-> A poodle = 0.8438
    A cat <-> A wolf = 0.9086
    A cat <-> A lion = 0.8915
    A cat <-> A house = 0.8724
    A dog <-> A dog = 1.0000
    **A dog <-> A labrador = 0.8481**
    A dog <-> A poodle = 0.9010
    A dog <-> A wolf = 0.9260
    A dog <-> A lion = 0.8861
    **A dog <-> A house = 0.8853**
    A labrador <-> A labrador = 1.0000
    A labrador <-> A poodle = 0.7723
    A labrador <-> A wolf = 0.8111
    A labrador <-> A lion = 0.7837
    A labrador <-> A house = 0.7570
    A poodle <-> A poodle = 1.0000
    A poodle <-> A wolf = 0.8540
    A poodle <-> A lion = 0.8460
    A poodle <-> A house = 0.8120
    A wolf <-> A wolf = 1.0000
    A wolf <-> A lion = 0.9044
    A wolf <-> A house = 0.8607
    A lion <-> A lion = 1.0000
    A lion <-> A house = 0.8403
    A house <-> A house = 1.0000
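
    Why these values match the question's numbers: as far as I can tell from the transformers implementation, get_text_features returns the pooled (EOS-token) hidden state passed through the model's text projection, which is essentially what clip.encode_text computes as well. A minimal check of that assumption:

        with torch.no_grad():
            pooled = text_encoder(**text_inputs).pooler_output  # EOS-token hidden state, [7, 512]
            projected = model.text_projection(pooled)           # projected features, [7, 512]
            print(torch.allclose(projected, text_features, atol=1e-5))  # expected: True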
    

    But the results have improved with the [1,77,512] embedding, and now the dog is closer to the labrador than to the house. Still, you'll get funny results such as the cat being more similar to a house than to a poodle.

    A cat <-> A cat = 1.0000
    A cat <-> A dog = 0.8880
    A cat <-> A labrador = 0.8057
    A cat <-> A poodle = 0.7579
    A cat <-> A wolf = 0.8558
    A cat <-> A lion = 0.8358
    A cat <-> A house = 0.8024
    A dog <-> A dog = 1.0000
    **A dog <-> A labrador = 0.8794**
    A dog <-> A poodle = 0.8583
    A dog <-> A wolf = 0.8888
    A dog <-> A lion = 0.8265
    **A dog <-> A house = 0.8294**
    A labrador <-> A labrador = 1.0000
    A labrador <-> A poodle = 0.8006
    A labrador <-> A wolf = 0.8182
    A labrador <-> A lion = 0.7958
    A labrador <-> A house = 0.7608
    A poodle <-> A poodle = 1.0000
    A poodle <-> A wolf = 0.7928
    A poodle <-> A lion = 0.7735
    A poodle <-> A house = 0.7623
    A wolf <-> A wolf = 1.0000
    A wolf <-> A lion = 0.8496
    A wolf <-> A house = 0.8063
    A lion <-> A lion = 1.0000
    A lion <-> A house = 0.7671
    A house <-> A house = 1.0000
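
    One further caveat (my addition, not covered by the original answer): with padding="max_length", the flattened [77, 512] state also contains the hidden states at the padding positions. If you want to compare only the real tokens, one option is to mean-pool the hidden states with the attention mask; a rough sketch, untested:

        # variation on the answer above: average the per-token hidden states over the
        # non-padding positions indicated by the attention mask
        mask = text_inputs.attention_mask.unsqueeze(-1)             # [7, 77, 1]
        hidden = text_encoder(**text_inputs).last_hidden_state      # [7, 77, 512]
        mean_pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)  # [7, 512]

        print("\n\nusing mean-pooled hidden states")
        for i1, label1 in enumerate(prompts):
          for i2, label2 in enumerate(prompts):
            if i2 >= i1:
              print(f"{label1} <-> {label2} = {dist(mean_pooled[i1], mean_pooled[i2]):.4f}")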