This is my code:
# Split the (query, rephrases) pairs read from the CSV into two parallel lists
BATCH_SIZE = 64
input_sequences = []
target_sequences = []
data = read_csv('gpt-j-data.csv')
for query, rephrases in data:
    input_sequences.append(query)
    target_sequences.append(rephrases)

# Load the pretrained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Tokenize every sequence and wrap the resulting token ids in a tensor
# NOTE(review): per the sample data, each target entry is a *list* of rephrase
# strings, so encode() is handed a list rather than one string here -- confirm
# this is intended.
input_sequences = [
    torch.tensor(tokenizer.encode(text, add_special_tokens=True))
    for text in input_sequences
]
target_sequences = [
    torch.tensor(tokenizer.encode(text, add_special_tokens=True))
    for text in target_sequences
]

# Pad or truncate every tensor to exactly 4 elements so they can be batched
input_sequences = ensure_tensor_size(input_sequences, 4)
target_sequences = ensure_tensor_size(target_sequences, 4)

# Wrap the tensors in a RephraseDataset and serve them in shuffled batches
dataset = RephraseDataset(input_sequences, target_sequences)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): vocab_size=1000 is far smaller than the GPT-2 tokenizer's
# vocabulary (50257 entries). Presumably vocab_size sizes the model's embedding
# table, so any token id >= 1000 is out of range -- this mismatch is what
# produces the "IndexError: index out of range in self" reported for this script.
model = RephraseGenerator(vocab_size=1000, embedding_dim=256, hidden_size=512, num_layers=2, dropout=0.2)
# Move the model to the device
model.to(device)

# Set the optimizer and loss function
optimizer = optim.AdamW(model.parameters())
loss_fn = nn.CrossEntropyLoss()

train(model, dataloader, optimizer, device)
And this is my train function:
# Training loop
def train(model, data_loader, optimizer, device):
    """Run one pass over ``data_loader`` and return the mean per-batch loss.

    Args:
        model: callable invoked as ``model(inputs, targets)``; ``model.train()``
            is called on it before the loop starts.
        data_loader: iterable of ``(input_sequence, target_sequence)`` batches
            that also supports ``len()``.
        optimizer: object providing ``zero_grad()`` and ``step()``.
        device: device every batch is moved to via ``.to(device)``.

    Returns:
        The sum of the per-batch loss values divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: if ``len(data_loader)`` is 0.
    """
    model.train()
    running_loss = 0
    for batch_inputs, batch_targets in data_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Clear gradients left over from the previous batch before the new pass
        optimizer.zero_grad()
        # The targets are passed to the model as well as to the loss function
        loss = rephrase_loss(model(batch_inputs, batch_targets), batch_targets)
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python number, so no graph is kept alive
        running_loss += loss.item()
    return running_loss / len(data_loader)
This is my ensure_tensor_size() function:
def ensure_tensor_size(tensor_list, size, pad_value=0):
    """Force every tensor in the list to have length ``size`` along dimension 0.

    Tensors shorter than ``size`` are padded at the end with ``pad_value``;
    tensors longer than ``size`` are truncated to their first ``size`` entries;
    tensors that already have the right length are passed through unchanged.

    Args:
        tensor_list: a list of tensors with at least one dimension.
        size: an integer giving the desired length along dimension 0.
        pad_value: the fill value used when padding (defaults to 0, the
            original behaviour).

    Returns:
        a new list of tensors, each with ``size`` entries along dimension 0.
    """
    resized = []
    for tensor in tensor_list:
        length = tensor.size(0)
        if length < size:
            # F.pad reads (before, after) pairs starting from the LAST
            # dimension, so emit a (0, 0) pair for every trailing dimension to
            # make the padding land on dimension 0 -- the one measured above.
            pad_spec = (0, 0) * (tensor.dim() - 1) + (0, size - length)
            tensor = F.pad(tensor, pad_spec, value=pad_value)
        elif length > size:
            tensor = tensor[:size]
        resized.append(tensor)
    return resized
This is the error that I'm getting:
IndexError Traceback (most recent call last)
<ipython-input-5-1e843ae1e696> in <module>
233 loss_fn = nn.CrossEntropyLoss()
234
--> 235 train(model, dataloader, optimizer, device)
5 frames
/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2208 # remove once script supports set_grad_enabled
2209 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2211
2212
IndexError: index out of range in self
This is what my data looks like:
[('Outdoor toys for kids',
[" Kids' outdoor toys",
' Outdoor playthings for children',
" Children's outdoor entertainment",
' Outdoor games for young ones ']),
('ducational toys for kids',
[" Kids' educational toys",
' Educational playthings for children',
" Children's educational entertainment",
' Educational games for young ones']),
('Dolls for girls',
[" Girls' dolls",
' Dolls for little girls',
' Entertainment for young girls',
' Toys for young girls']),
("Kids' swings",
[" Children's swings", ' Swings for kids', ' Swings for young ones', '']),
("Kids' footballs",
[" Children's footballs",
' Footballs for kids',
' Footballs for young ones',
' Entertainment for young kids']),
('Pogo sticks for children',
[" Kids' pogo sticks",
" Children's pogo sticks",
' Pogo sticks for kids',
' Pogo sticks for young ones']),
('Holiday gifts',
[' Gifts for the holidays',
' Gifts for special occasions',
' Gifts for celebrations',
' Gifts for loved ones']),
('Best holiday gifts',
[' Top holiday gifts',
' Highly rated holiday gifts',
' Recommended holiday gifts',
' Best gifts for the holidays']),
('Popular holiday gifts',
[' Best-selling holiday gifts',
' Most sought-after holiday gifts',
' Trending holiday gifts',
' Hot holiday gifts']),
('Holiday gifts for kids',
[' Gifts for children during the holidays',
' Gifts for young ones during the holidays',
' Gifts for little ones during the holidays',
' Gifts for minors during the holidays']),
('Holiday gifts for men',
[' Gifts for men during the holidays',
' Gifts for him during the holidays',
' Gifts for fathers during the holidays',
' Gifts for husbands during the holidays']),
('Holiday gifts for women',
[' Gifts for women during the holidays',
' Gifts for her during the holidays',
' Gifts for mothers during the holidays',
' Gifts for wives during the holidays']),
('Holiday gifts for teens',
[' Gifts for teenagers during the holidays',
' Gifts for adolescents during the holidays',
' Gifts for young adults during the holidays',
' Gifts for older kids during the holidays']),
('Holiday gifts for parents',
[' Gifts for parents during the holidays',
' Gifts for mom and dad during the holidays',
' Gifts for caregivers during the holidays',
' Gifts for adults during the holidays']),
('Holiday gifts for grandparents',
[' Gifts for grandparents during the holidays',
' Gifts for grandpa and grandma during the holidays',
' Gifts for senior citizens during the holidays',
' Gifts for older adults during the holidays']),
('Holiday gifts for friends',
[' Gifts for friends during the holidays',
' Gifts for close friends during the holidays',
' Gifts for companions during the holidays',
' Gifts for peers during the holidays']),
('Holiday gifts for coworkers',
[' Gifts for coworkers during the holidays',
' Gifts for colleagues during the holidays',
' Gifts for associates during the holidays',
' Gifts for professionals during the holidays']),
('Holiday gifts for pets',
[' Gifts for pets during the holidays',
' Gifts for dogs during the holidays',
' Gifts for cats during the holidays',
' Gifts for animals during the holidays']),
('Holiday gifts for gamers',
[' Gifts for gamers during the holidays',
' Gifts for video game enthusiasts during the holidays',
' Gifts for console gamers during the holidays',
' Gifts for PC gamers during the holidays']),
('Holiday gifts for hikers',
[' Gifts for hikers during the holidays',
' Gifts for outdoor enthusiasts during the holidays',
' Gifts for walkers during the holidays',
' Gifts for nature lovers during the holidays']),
('Holiday gifts for book lovers',
[' Gifts for book lovers during the holidays',
' Gifts for readers during the holidays',
' Gifts for bibliophiles during the holidays',
' Gifts for literature enthusiasts during the holidays']),
('Holiday gifts for foodies',
[' Gifts for foodies during the holidays',
' Gifts for gourmet cooks during the holidays',
' Gifts for culinary enthusiasts during the holidays',
' Gifts for epicures during the holidays']),
('Holiday gifts for knitters and crocheters',
[' Gifts for knitters and crocheters during the holidays',
' Gifts for fiber artists during the holidays',
' Gifts for yarn enthusiasts during the holidays',
' Gifts for needlework enthusiasts during the holidays']),
('Holiday gifts for sewers and quilters',
[' Gifts for sewers and quilters during the holidays',
' Gifts for needleworkers during the holidays',
' Gifts for seamstresses during the holidays',
' Gifts for tailors during the holidays']),
('Holiday gifts for DIYers',
[' Gifts for DIYers during the holidays',
' Gifts for home improvement enthusiasts during the holidays',
' Gifts for handymen and handywomen during the holidays',
' Gifts for crafters during the holidays']),
('Holiday gifts for mechanics',
[' Gifts for mechanics during the holidays',
' Gifts for auto mechanics during the holidays',
' Gifts for mechanic enthusiasts during the holidays',
' Gifts for technicians during the holidays']),
('Holiday gifts for handymen and handywomen',
[' Gifts for handymen and handywomen during the holidays',
' Gifts for DIY enthusiasts during the holidays',
' Gifts for home improvement experts during the holidays',
' Gifts for craftspeople during the holidays']),
('Luggage sets',
[' Best luggage',
' Travel luggage',
' Suitcase sets',
' Best luggage sets ']),
('Travel backpacks',
[' Backpacks for travel',
' Best travel backpacks',
' Popular travel backpacks',
' Backpacks for vacation ']),
('Travel pillows',
[' Best travel pillows',
' Top-rated travel pillows',
' Popular travel pillows',
' Best pillows for travel ']),
('Travel neck pillows',
[' Best travel neck pillows',
' Best-selling travel neck pillows',
' Neck pillows for travel',
' Popular travel neck pillows '])]
Where am I going wrong?
Why the error occurs
Your model (`RephraseGenerator`) has `vocab_size=1000`, but your tokenizer (`GPT2Tokenizer`) actually has a vocab size of 50257, so the tokenizer produces token ids >= 1000 and the model throws an `IndexError` while trying to look up embeddings for those tokens.
Here's a minimal reproduction of the same `IndexError`:
from transformers import GPT2Tokenizer
import torch
import torch.nn as nn
# Load the same pretrained GPT-2 tokenizer that the question uses
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# An embedding table with only 1000 rows: valid lookup indices are 0..999
model = nn.Embedding(1000, 32)
# This line raises an IndexError: the tokenizer emits an id that is >= 1000
model(torch.tensor(tokenizer.encode("Hello world", add_special_tokens=True)))
How to fix in general
Use the correct `vocab_size` (matching your tokenizer) when constructing your embedding layer:
# Size the embedding table from the tokenizer's vocabulary instead of a
# hard-coded 1000, so the token ids it produces have a matching row
model = nn.Embedding(tokenizer.vocab_size, 32)
# This line returns the embedding with no errors
model(torch.tensor(tokenizer.encode("Hello world", add_special_tokens=True)))
How to fix, in OP's codebase
It looks like you need to replace `vocab_size=1000` with `vocab_size=tokenizer.vocab_size` in the `RephraseGenerator()` call, to achieve the same effect.