Search code examples
pythonpandasnlptext-miningsimilarity

Catalog sentences by 5 words that represent them


I have a dataframe with 1000 rows of text in df['text'].

I also have 5 words, and for each of them I want to know how much it represents the text (as a score between 0 and 1).

Each score will be stored in df["word1"], df["word2"], etc.

I would be glad for recommendations on how to do that.

edit

represent = the semantic distance between the word and the text.

For example, let's say the text in row 1 is "i want to eat" and I have 2 words: food and house.

So df["food"] would hold a higher score than df["house"].


Solution

  • You could use a pre-trained sentence transformer model from sentence_transformers:

    import pandas as pd
    from sentence_transformers import SentenceTransformer, util
    
    
    class SemanticSimilarityCalculator:
      """Score how closely a fixed set of words matches pieces of text.

      The words and the texts are embedded with the same sentence-transformer
      model and compared with cosine similarity. ``encode_words`` must be
      called before either scoring method.

      Note: cosine similarity lies in [-1, 1] (higher means closer), so scores
      are not guaranteed to fall in [0, 1].
      """

      def __init__(self, model_name: str = 'all-MiniLM-L6-v2') -> None:
        """Load the sentence-transformer model called ``model_name``."""
        self.model = SentenceTransformer(model_name)
        # Both stay None until encode_words() runs. Initialising ``words`` here
        # lets a forgotten encode_words() call surface as the intended
        # ValueError instead of an AttributeError.
        self.word_embeddings = None
        self.words = None

      def encode_words(self, words: list[str]) -> None:
        """Embed ``words`` and remember them for later scoring.

        Args:
          words: The words every text will be compared against.
        """
        # Encode first so a failing encode leaves any previous state intact.
        embeddings = self.model.encode(words, convert_to_tensor=True)
        self.word_embeddings = embeddings
        self.words = words

      def calculate_similarity(self, text: str) -> list[float]:
        """Return the cosine similarity of ``text`` to each encoded word.

        Args:
          text: A single piece of text to score.

        Returns:
          One score per encoded word, in the order given to ``encode_words``.

        Raises:
          ValueError: If ``encode_words`` has not been called yet.
        """
        if self.word_embeddings is None:
          raise ValueError('Words must be encoded before calculating similarity.')
        text_embedding = self.model.encode(text, convert_to_tensor=True)
        # cos_sim returns a (1, n_words) matrix for a single text; take row 0.
        return util.cos_sim(text_embedding, self.word_embeddings)[0].tolist()

      def add_similarity_scores_to_df(
        self, df: pd.DataFrame, text_column: str
      ) -> pd.DataFrame:
        """Add one ``word_<word>`` score column per encoded word to ``df``.

        ``df`` is modified in place and also returned.

        Args:
          df: DataFrame holding the texts to score.
          text_column: Name of the column in ``df`` that contains the text.

        Returns:
          The same ``df`` object with the score columns added.

        Raises:
          ValueError: If ``encode_words`` has not been called yet.
        """
        if self.word_embeddings is None or self.words is None:
          raise ValueError(
            'Words must be encoded before adding scores to the DataFrame.'
          )
        similarity_columns = ['word_' + word for word in self.words]

        texts = df[text_column].tolist()
        if not texts:
          # Nothing to encode; still create the (empty) score columns.
          for column in similarity_columns:
            df[column] = pd.Series(dtype='float64', index=df.index)
          return df

        # Encode every row in one batched call instead of one call per row.
        text_embeddings = self.model.encode(texts, convert_to_tensor=True)
        score_rows = util.cos_sim(text_embeddings, self.word_embeddings).tolist()
        # Assign plain lists so values are placed by position, independent of
        # the DataFrame's index labels.
        for position, column in enumerate(similarity_columns):
          df[column] = [row[position] for row in score_rows]
        return df
    
    
    def main():
      """Demo: score three sample sentences against five words and print them."""
      sample_texts = ['I want to eat', 'The house is big', 'I need to sleep']
      target_words = ['food', 'house', 'sleep', 'drink', 'run']

      frame = pd.DataFrame({'text': sample_texts})

      # The words have to be encoded once before any text can be scored.
      scorer = SemanticSimilarityCalculator()
      scorer.encode_words(target_words)

      print(scorer.add_similarity_scores_to_df(frame, text_column='text'))
    
    
    # Run the demo only when this file is executed as a script, not on import.
    if __name__ == '__main__':
      main()
    

    Output:

                   text  word_food  word_house  word_sleep  word_drink  word_run
    0     I want to eat   0.592410    0.215032    0.254065    0.370329  0.259350
    1  The house is big   0.243262    0.672110    0.170785    0.213780  0.119716
    2   I need to sleep   0.253703    0.222462    0.725105    0.358372  0.303838