Search code examples
pythonnlphuggingface-transformerssentiment-analysislarge-language-model

How to generate sentiment scores using predefined aspects with deberta-v3-base-absa-v1.1 Huggingface model?


I have a dataframe with the text in the first column and predefined aspects in a second column; however, some texts have no aspects defined (for example, row 2).

import pandas as pd  # needed for pd.DataFrame below; the snippet is otherwise not runnable on its own

# Sample input: one review sentence per row, paired with the list of
# predefined aspects to score for that sentence.  The second row
# deliberately has an empty aspect list.
data = {
    'text': [
        "The camera quality of this phone is amazing.",
        "The belt is poor quality",
        "The battery life could be improved.",
        "The display is sharp and vibrant.",
        "The customer service was disappointing."
    ],
    'aspects': [
        ["camera", "phone"],
        [],
        ["battery", "life"],
        ["display"],
        ["customer service"]
    ]
}

# Two columns: 'text' (str) and 'aspects' (list of str, possibly empty).
df = pd.DataFrame(data)

I want to generate two things

  1. Using the predefined aspects for each text, generate a sentiment score per aspect.
  2. Using only the text, generate the aspects and also the sentiment score for each, with the same package.

Note: This package yangheng/deberta-v3-base-absa-v1.1

1) Generate sentiment scores based on the predefined aspects

2) Generate both the aspects and their respective sentiments

Note: Row 2 does not have any predefined aspects

I tried the following and am getting an error:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

# Load the aspect-based sentiment model together with its tokenizer
model_name = "yangheng/deberta-v3-base-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Per-row results, kept in the same order as the DataFrame rows
aspects = []
sentiments = []

# Walk the two columns in lockstep; each row carries a sentence and the
# (possibly empty) list of aspects to score for it.
for text, row_aspects in zip(df['text'], df['aspects']):
    aspect_sentiments = []

    for aspect in row_aspects:
        # The sentence and the aspect are encoded together as a pair
        encoded = tokenizer(text, aspect, return_tensors="pt")

        with torch.inference_mode():
            logits = model(**encoded).logits

        # Highest-scoring class index, mapped to its label via the model config
        label_id = torch.argmax(logits).item()
        aspect_sentiments.append(f"{aspect}: {model.config.id2label[label_id]}")

    aspects.append(row_aspects)
    sentiments.append(aspect_sentiments)

# Attach the collected aspects and "aspect: label" strings as new columns
df['generated_aspects'] = aspects
df['generated_sentiments'] = sentiments

# Show the result
print(df)



Generic example of how to use the package:

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "yangheng/deberta-v3-base-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

aspects = ["food", "service"]
text = "The food was great but the service was terrible."

# Maps each aspect to a (label, probability) tuple for its top class
sentiment_aspect = {}
for aspect in aspects:
    # Sentence and aspect are encoded together as a pair
    encoded = tokenizer(text, aspect, return_tensors="pt")

    with torch.inference_mode():
        logits = model(**encoded).logits

    # Turn the first (only) row of logits into probabilities, then take the
    # most likely class and its probability.
    probs = F.softmax(logits[0], dim=-1)
    best = torch.argmax(probs).item()
    sentiment_aspect[aspect] = (model.config.id2label[best], probs[best].item())

print(sentiment_aspect)

Desired Output

enter image description here


Solution

  • For the yangheng/deberta-v3-base-absa-v1.1 model specifically, this is the usage; you have to run the model once per aspect:

    from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

    # Load the ABSA model and its tokenizer
    model_name = "yangheng/deberta-v3-base-absa-v1.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Wrap them in a text-classification pipeline that returns a label and score
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

    # The aspect is supplied as text_pair alongside the sentence, so the
    # classifier is called once for every aspect of interest.
    for aspect in ['camera', 'phone']:
        print(aspect, classifier('The camera quality of this phone is amazing.', text_pair=aspect))
    

    [out]:

    camera [{'label': 'Positive', 'score': 0.9967294931411743}]
    phone [{'label': 'Neutral', 'score': 0.9472787380218506}]
    

    To get the zero-shot classification scores in general, try using pipeline:

    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from transformers import pipeline
    
    
    # Load the ABSA model and tokenizer
    model_name = "yangheng/deberta-v3-base-absa-v1.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    
    
    # Reuse the same checkpoint in a zero-shot-classification pipeline.
    # NOTE(review): this checkpoint is a sentiment classifier; the zero-shot
    # pipeline presumably expects an NLI-style model, so treat the scores as a
    # ranking of the candidate labels, not as sentiment - confirm before relying on them.
    pipe = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)
    
    # Score each candidate label (here: the aspects) against the sentence.
    pipe("The camera quality of this phone is amazing.", candidate_labels=["camera", "phone"])
    

    [out]:

    {'sequence': 'The camera quality of this phone is amazing.',
     'labels': ['camera', 'phone'],
     'scores': [0.9036691784858704, 0.09633082151412964]}
    

    Depending on what "text generated aspect" means, perhaps it is keyword extraction. If so, a search on https://huggingface.co/models?search=keyword gives this as the most-downloaded model: https://huggingface.co/yanekyuk/bert-uncased-keyword-extractor

    from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

    tokenizer2 = AutoTokenizer.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
    model2 = AutoModelForTokenClassification.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")

    # Build the token-classification pipeline once; constructing it inside
    # extract_aspect() would repeat the same setup on every call.
    extractor = pipeline("ner", model=model2, tokenizer=tokenizer2)


    def extract_aspect(text):
        """Return the keyword phrases the extractor tags in ``text``.

        Consecutive tags are merged into character spans: a tag whose label
        starts with 'B' opens a new span and a tag whose label starts with 'I'
        extends the most recent span.  Each span is then sliced out of ``text``.

        Args:
            text: The sentence to extract keyword phrases from.

        Returns:
            A list of substrings of ``text``, one per detected phrase; empty
            when the extractor tags nothing.
        """
        spans = []
        for tag in extractor(text):
            label = tag['entity']
            if label.startswith('B'):
                spans.append([tag['start'], tag['end']])
            elif label.startswith('I'):
                if spans:
                    spans[-1][1] = tag['end']
                else:
                    # An 'I' tag with no preceding 'B' would otherwise index an
                    # empty list; treat it as the start of a new phrase.
                    spans.append([tag['start'], tag['end']])
        return [text[start:end] for start, end in spans]

    text = "The camera quality of this phone is amazing."

    extract_aspect(text)
    

    [out]:

    ['camera']
    

    Putting the extractor and classifier together:

    from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
    from transformers import pipeline


    # Load the ABSA model and tokenizer
    model_name = "yangheng/deberta-v3-base-absa-v1.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Zero-shot pipeline: scores candidate labels against the text
    classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

    tokenizer2 = AutoTokenizer.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")
    model2 = AutoModelForTokenClassification.from_pretrained("yanekyuk/bert-uncased-keyword-extractor")

    # Build the keyword-extraction pipeline once instead of on every call
    extractor = pipeline("ner", model=model2, tokenizer=tokenizer2)


    def extract_aspect(text):
        """Return the keyword phrases the extractor tags in ``text``.

        A tag whose label starts with 'B' opens a new character span and a tag
        whose label starts with 'I' extends the most recent span; each span is
        then sliced out of ``text``.

        Args:
            text: The sentence to extract keyword phrases from.

        Returns:
            A list of substrings of ``text``, one per detected phrase; empty
            when the extractor tags nothing.
        """
        spans = []
        for tag in extractor(text):
            label = tag['entity']
            if label.startswith('B'):
                spans.append([tag['start'], tag['end']])
            elif label.startswith('I'):
                if spans:
                    spans[-1][1] = tag['end']
                else:
                    # An 'I' tag with no preceding 'B' would otherwise index an
                    # empty list; treat it as the start of a new phrase.
                    spans.append([tag['start'], tag['end']])
        return [text[start:end] for start, end in spans]

    text = "The camera quality of this phone is amazing."

    # Use the extracted phrases as the candidate labels.  The pipeline in this
    # snippet is named `classifier`; calling `pipe` here would raise NameError
    # when the snippet is run on its own.
    classifier(text, candidate_labels=extract_aspect(text))
    

    [out]:

    {'sequence': 'The camera quality of this phone is amazing.',
     'labels': ['camera'],
     'scores': [0.9983300566673279]}
    

    Q: But what if the extracted keywords are not "right" or don't match the predefined ones?

    A: No model is perfect, and the example model above is a keyword extractor, not a product-aspect extractor. YMMV.

    Q: Why isn't the zero-shot classifier giving me negative / positive labels?

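    A: The zero-shot classifier scores the text against the extracted labels; it is not a sentiment classifier. For Positive / Neutral / Negative labels, use the text-classification pipeline with text_pair shown at the top of this answer.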