Tags: nlp, huggingface-transformers, huggingface, fine-tuning

IndexError: list index out of range when trying to predict from the fine-tuned model using Hugging Face


I am trying to learn how to fine-tune a pretrained model and use it. This is my code:

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import torch

# Define a simple accuracy metric
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == labels).mean()}

# Load the dataset
dataset = load_dataset("imdb", split='train[:1%]')
small_train_dataset = dataset.train_test_split(test_size=0.1)['train']
small_eval_dataset = dataset.train_test_split(test_size=0.1)['test']

# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

small_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
small_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)
small_train_dataset = small_train_dataset.rename_column("label", "labels")
small_eval_dataset = small_eval_dataset.rename_column("label", "labels")
small_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
small_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Define training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
validation_results = trainer.evaluate()
print(validation_results)

Now I am trying to make a prediction with the fine-tuned model, like this:

inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
predictions = trainer.predict(test_dataset=inputs)

I am getting this error when I try to make the prediction:

IndexError                                Traceback (most recent call last)
Cell In[8], line 7
      3 inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
      6 # Make predictions
----> 7 predictions = trainer.predict(test_dataset=inputs)

File C:\Python311\Lib\site-packages\transformers\trainer.py:3305, in Trainer.predict(self, test_dataset, ignore_keys, metric_key_prefix)
   3302 start_time = time.time()
   3304 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 3305 output = eval_loop(
   3306     test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
   3307 )
   3308 total_batch_size = self.args.eval_batch_size * self.args.world_size
   3309 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:

File C:\Python311\Lib\site-packages\transformers\trainer.py:3408, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   3406 observed_num_examples = 0
   3407 # Main evaluation loop
-> 3408 for step, inputs in enumerate(dataloader):
   3409     # Update the observed num examples
   3410     observed_batch_size = find_batch_size(inputs)
   3411     if observed_batch_size is not None:

File C:\Python311\Lib\site-packages\accelerate\data_loader.py:454, in DataLoaderShard.__iter__(self)
    452 # We iterate one batch ahead to check when we are at the end
    453 try:
--> 454     current_batch = next(dataloader_iter)
    455 except StopIteration:
    456     yield

File C:\Python311\Lib\site-packages\torch\utils\data\dataloader.py:631, in _BaseDataLoaderIter.__next__(self)
    628 if self._sampler_iter is None:
    629     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    630     self._reset()  # type: ignore[call-arg]
--> 631 data = self._next_data()
    632 self._num_yielded += 1
    633 if self._dataset_kind == _DatasetKind.Iterable and \
    634         self._IterableDataset_len_called is not None and \
    635         self._num_yielded > self._IterableDataset_len_called:

File C:\Python311\Lib\site-packages\torch\utils\data\dataloader.py:675, in _SingleProcessDataLoaderIter._next_data(self)
    673 def _next_data(self):
    674     index = self._next_index()  # may raise StopIteration
--> 675     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    676     if self._pin_memory:
    677         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File C:\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File C:\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py:51, in <listcomp>(.0)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]

File C:\Python311\Lib\site-packages\transformers\tokenization_utils_base.py:255, in BatchEncoding.__getitem__(self, item)
    253     return self.data[item]
    254 elif self._encodings is not None:
--> 255     return self._encodings[item]
    256 elif isinstance(item, slice):
    257     return {key: self.data[key][item] for key in self.data.keys()}

IndexError: list index out of range


Solution

  • The error occurs because the trainer.predict method expects a Dataset as input, but you are passing the BatchEncoding that the tokenizer returns for a single example. The DataLoader wraps that BatchEncoding as if it were a map-style dataset and indexes into it element by element, which is what lands in BatchEncoding.__getitem__ in the traceback.
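
    You can reproduce the failure in isolation: len() of a BatchEncoding counts its keys, so the DataLoader thinks there are several "examples" and indexes past the single encoded text. A minimal sketch, assuming the same bert-base-uncased fast tokenizer:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    enc = tokenizer("a single sentence", return_tensors="pt")

    print(len(enc))  # 3 -> counts the keys: input_ids, token_type_ids, attention_mask
    print(enc[0])    # works: the one underlying fast-tokenizer Encoding
    print(enc[1])    # IndexError: list index out of range, as in the traceback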

    To perform predictions on a single input, you need to prepare it similarly to how the dataset was prepared before training, and then use the model directly for prediction.

    Here's how you can modify your code to make predictions on a single input:

    1. Prepare the input correctly
    2. Use the model directly for prediction

    Here's the revised code:

    
    from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
    from datasets import load_dataset
    import numpy as np
    import torch
    
    # Define a simple accuracy metric
    def compute_metrics(p):
        predictions, labels = p
        preds = np.argmax(predictions, axis=1)
        return {"accuracy": (preds == labels).mean()}
    
    # Load the dataset
    dataset = load_dataset("imdb", split='train[:1%]')
    split = dataset.train_test_split(test_size=0.1)  # split once so the train and eval sets do not overlap
    small_train_dataset = split['train']
    small_eval_dataset = split['test']
    
    # Load the tokenizer and model
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    
    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True)
    
    small_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
    small_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)
    small_train_dataset = small_train_dataset.rename_column("label", "labels")
    small_eval_dataset = small_eval_dataset.rename_column("label", "labels")
    small_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    small_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="test_trainer",
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01
    )
    
    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics
    )
    
    # Train the model
    trainer.train()
    
    # Evaluate the model
    validation_results = trainer.evaluate()
    print(validation_results)
    
    # Make a prediction on a single input
    inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # move tensors to the model's device (CPU or GPU)
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
    
    print(f"Predicted label: {predictions.item()}")