Tags: python, tensorflow, artificial-intelligence, huggingface-transformers, gpt-2

How do I deal with the "stack expects each tensor to be equal size" error while fine-tuning a GPT-2 model?


I tried to fine-tune a model on my personal information, so that I can create a chatbot where people can learn about me, similar to ChatGPT.

However, I got the following error:

RuntimeError: stack expects each tensor to be equal size, but got [47] at entry 0 and [36] at entry 1

This happens because my inputs have different lengths.

Here are two of my sample inputs:

What is the webisite of ABC company ? -> https://abcdef.org/

Do you know the website of ABC company ? -> It is https://abcdef.org/
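
To illustrate (a minimal reproduction, assuming the same gpt2 tokenizer), encoding those two examples produces tensors of different lengths, and the default DataLoader collate tries to torch.stack them, which is where the error comes from:

from transformers import GPT2Tokenizer
import torch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

a = tokenizer.encode("Q: What is the webisite of ABC company ? A: https://abcdef.org/")
b = tokenizer.encode("Q: Do you know the website of ABC company ? A: It is https://abcdef.org/")

print(len(a), len(b))  # two different lengths
torch.stack([torch.tensor(a), torch.tensor(b)])  # RuntimeError: stack expects each tensor to be equal size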

Here is what I have tried so far:

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader

class QADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Add a padding token to the tokenizer
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index):
        question = self.questions[index]
        answer = self.answers[index]

        input_text = f"Q: {question} A: {answer}"
        input_ids = self.tokenizer.encode(input_text, add_special_tokens=True, max_length=self.max_length, padding=True, truncation=True)

        if input_ids is None:
            return None

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        print(f"Input ids size: {input_ids.size()}")
        return input_ids

# Set up the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Load the question and answer data
questions = ["What is the webisite of ABC company ?", "Do you know the website of ABC company ?"]
answers = ["https://abcdef.org/", "It is https://abcdef.org/"]

# Create the dataset and data loader
max_length = 64
dataset = QADataset(questions, answers, tokenizer, max_length=max_length)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Fine-tune the model on the QA dataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(3):
    running_loss = 0.0
    for batch in data_loader:
        batch = batch.to(device)

        outputs = model(batch, labels=batch)
        loss, _ = outputs[:2]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch + 1} loss: {running_loss / len(data_loader)}")

# Save the fine-tuned model
model.save_pretrained("qa_finetuned_gpt2")

I don't have a solid background in AI; I have mostly been reading references and trying to implement them.


Solution

  • It looks like your inputs aren't actually being padded. The model expects every example in a batch to have the same size, so inputs that are too short need to be padded and inputs that are too long truncated. In your __getitem__, padding=True only pads to the longest sequence in the call, and since you encode one example at a time, nothing gets padded; use padding="max_length" so every example comes out at the same fixed length.


    Try changing how the tokenizer processes the inputs:

    
    # Define the data loading class
    class MyDataset(Dataset):
        def __init__(self, data_path, tokenizer):
            self.data_path = data_path
            self.tokenizer = tokenizer
    
            with open(self.data_path, 'r') as f:
                self.data = f.read().split('\n')
    
        def __len__(self):
            return len(self.data)
    
        def __getitem__(self, index):
            text = self.data[index]
            # padding="max_length" pads every example to the same fixed length,
            # so the default collate can stack them into a single batch tensor
            inputs = self.tokenizer.encode(text, add_special_tokens=True,
                truncation=True, max_length=80, padding="max_length")
            return torch.tensor(inputs)
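
    Two more things are worth checking once padding works (a sketch, not part of the fix above; the helper name encode_example is only for illustration). Since a new [PAD] token is added to the tokenizer, the model's embedding matrix has to be resized to include it, and the padded positions are best masked out of the loss by setting their labels to -100, which GPT-2's language-modeling loss ignores:

    # Grow the embedding matrix so the new [PAD] token has an embedding
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

    def encode_example(text, tokenizer, max_length=80):
        enc = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
        input_ids = torch.tensor(enc["input_ids"])
        attention_mask = torch.tensor(enc["attention_mask"])
        # Use the inputs as labels, but ignore padded positions in the loss
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

    With a dataset that returns dictionaries like this, the default collate still stacks each field into a batch, the training step becomes outputs = model(**batch), and the loss is outputs[0] as before.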