I am working on sentiment analysis using the IMDb dataset and a GPT-2-based model. This is a toy project to understand PEFT and LoRA, as well as to get some experience with the Hugging Face libraries.
This is what I've tried out:
from datasets import load_dataset
splits = ["train", "test"]
ds = {split: ds for split, ds in zip(splits, load_dataset("imdb", split=splits))}
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 Tokenizer doesn't have a padding token.
tokenizer.pad_token = tokenizer.eos_token
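# Quick sanity check (illustrative): GPT-2 ships without a pad token, so after
# the assignment above the pad token is just the EOS token.
print(tokenizer.pad_token, tokenizer.pad_token_id)  # "<|endoftext|>" 50256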
def preprocess_function(examples):
    """Preprocess the IMDb dataset by returning tokenized examples."""
    tokens = tokenizer(examples['text'], padding='max_length', truncation=True)
    return tokens
tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(preprocess_function, batched=True)
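# Sanity check (illustrative): the mapped splits keep the original "text" and
# "label" columns and gain the tokenizer outputs.
print(tokenized_ds["train"].column_names)  # ['text', 'label', 'input_ids', 'attention_mask']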
from transformers import AutoModelForSequenceClassification

model2 = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},  # For converting predictions to strings
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
# The model config also needs a pad token id, matching the tokenizer.
model2.config.pad_token_id = model2.config.eos_token_id
from peft import LoraConfig
from peft import get_peft_model
lora_config = LoraConfig("lora_gpt2", fan_in_fan_out=True,)
lora_model = get_peft_model(model2, lora_config)
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

trainer_lora = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/sentiment_analysis2",
        learning_rate=2e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer_lora.train()
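For completeness, compute_metrics is defined earlier in my notebook and not shown above; a minimal accuracy-only version would look roughly like this:
import numpy as np

def compute_metrics(eval_pred):
    # Illustrative accuracy-only metric: argmax over the two class logits.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}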
When I run this code, I'm getting the following error and have had some difficulty debugging it:
File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3018, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3016 # The model's main input name, usually `input_ids`, has be passed for padding
3017 if self.model_input_names[0] not in encoded_inputs:
-> 3018 raise ValueError(
3019 "You should supply an encoding or a list of encodings to this method "
3020 f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
3021 )
3023 required_input = encoded_inputs[self.model_input_names[0]]
3025 if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']
I'm not sure how to resolve this, haven't been able to find many examples online, and was hoping the SO community could help out.
It turns out the LoRA-wrapped model expects the label column to be named labels rather than label.
To fix it, rename the column in both splits:
# LoRA takes in "labels", not "label" so we need to rename the
# training and testing sets
train_lora = tokenized_ds['train'].rename_column('label', 'labels')
test_lora = tokenized_ds['test'].rename_column('label', 'labels')
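Then pass the renamed splits to the Trainer instead of the original tokenized_ds splits. A sketch, assuming the TrainingArguments from the question have been pulled out into a training_args variable:
trainer_lora = Trainer(
    model=lora_model,
    args=training_args,  # same TrainingArguments as in the question
    train_dataset=train_lora,
    eval_dataset=test_lora,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer_lora.train()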
Also needed was the task_type (TaskType.SEQ_CLS) in the LoraConfig:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=['c_attn', 'c_proj'],
    lora_dropout=0.1,
    bias="none",
    fan_in_fan_out=True,
    task_type=TaskType.SEQ_CLS,
)
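With that config, re-wrap the base model; printing the trainable parameter count (print_trainable_parameters comes from PEFT's model wrapper) is a quick sanity check that only a small fraction of the weights, the LoRA adapters plus the classification head, will be trained:
lora_model = get_peft_model(model2, lora_config)
lora_model.print_trainable_parameters()  # trainable params should be a small fraction of all params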