I am working on sentiment analysis using the IMDb dataset and a GPT-2-based model. This is a toy project to understand PEFT and LoRA, as well as to get some experience with the Hugging Face libraries.
This is what I've tried out:
from datasets import load_dataset
splits = ["train", "test"]
ds = {split: ds for split, ds in zip(splits, load_dataset("imdb", split=splits))}
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 Tokenizer doesn't have a padding token.
tokenizer.pad_token = tokenizer.eos_token
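# Quick sanity check (illustrative): GPT-2 ships without a pad token, so after
# the assignment above the pad token is just the EOS token.
print(tokenizer.pad_token, tokenizer.pad_token_id)  # "<|endoftext|>" 50256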
def preprocess_function(examples):
    """Preprocess the IMDb dataset by returning tokenized examples."""
    tokens = tokenizer(examples['text'], padding='max_length', truncation=True)
    return tokens
tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(preprocess_function, batched=True)
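# Sanity check (illustrative): the mapped splits keep the original "text" and
# "label" columns and gain the tokenizer outputs.
print(tokenized_ds["train"].column_names)  # ['text', 'label', 'input_ids', 'attention_mask']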
from transformers import AutoModelForSequenceClassification

model2 = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},  # For converting predictions to strings
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
# The model config also needs a pad token id, matching the tokenizer.
model2.config.pad_token_id = model2.config.eos_token_id
from peft import LoraConfig
from peft import get_peft_model
lora_config = LoraConfig("lora_gpt2", fan_in_fan_out=True,)
lora_model = get_peft_model(model2, lora_config)
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

trainer_lora = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/sentiment_analysis2",
        learning_rate=2e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer_lora.train()
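For completeness, compute_metrics is defined earlier in my notebook and not shown above; a minimal accuracy-only version would look roughly like this:
import numpy as np

def compute_metrics(eval_pred):
    # Illustrative accuracy-only metric: argmax over the two class logits.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}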
When I run this code, I'm getting the following error and have had some difficulty debugging it:
File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3018, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3016 # The model's main input name, usually `input_ids`, has be passed for padding
3017 if self.model_input_names[0] not in encoded_inputs:
-> 3018 raise ValueError(
3019 "You should supply an encoding or a list of encodings to this method "
3020 f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
3021 )
3023 required_input = encoded_inputs[self.model_input_names[0]]
3025 if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']
I'm not sure how to resolve this, haven't been able to find many examples online, and was hoping the SO community could help out.
It turns out the LoRA-wrapped model expects the label column to be named labels rather than label.
To fix it, rename the column in both splits:
# LoRA takes in "labels", not "label" so we need to rename the
# training and testing sets
train_lora = tokenized_ds['train'].rename_column('label', 'labels')
test_lora = tokenized_ds['test'].rename_column('label', 'labels')
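Then pass the renamed splits to the Trainer instead of the original tokenized_ds splits. A sketch, assuming the TrainingArguments from the question have been pulled out into a training_args variable:
trainer_lora = Trainer(
    model=lora_model,
    args=training_args,  # same TrainingArguments as in the question
    train_dataset=train_lora,
    eval_dataset=test_lora,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer_lora.train()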
Also needed was the task_type (TaskType.SEQ_CLS) in the LoraConfig:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=['c_attn', 'c_proj'],
    lora_dropout=0.1,
    bias="none",
    fan_in_fan_out=True,
    task_type=TaskType.SEQ_CLS,
)
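With that config, re-wrap the base model; printing the trainable parameter count (print_trainable_parameters comes from PEFT's model wrapper) is a quick sanity check that only a small fraction of the weights, the LoRA adapters plus the classification head, will be trained:
lora_model = get_peft_model(model2, lora_config)
lora_model.print_trainable_parameters()  # trainable params should be a small fraction of all params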