Search code examples
python, machine-learning, pytorch

The training function is throwing an "index out of range in self" error


This is my code:

# Extract input and target sequences from data list
input_sequences = []
target_sequences = []
BATCH_SIZE = 64
data = read_csv('gpt-j-data.csv')

# Each record is a (query, rephrases) pair.
for query, rephrases in data:
    input_sequences.append(query)
    target_sequences.append(rephrases)

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Tokenize the input and target sequences
# NOTE(review): in the sample data each `rephrases` entry is a list of strings
# rather than a single string -- confirm tokenizer.encode() treats it as intended.
input_sequences = [tokenizer.encode(sequence, add_special_tokens=True) for sequence in input_sequences]
target_sequences = [tokenizer.encode(sequence, add_special_tokens=True) for sequence in target_sequences]

# Convert the input and target sequences to tensors
input_sequences = [torch.tensor(sequence) for sequence in input_sequences]
target_sequences = [torch.tensor(sequence) for sequence in target_sequences]

# Force every sequence to length 4 (shorter ones are zero-padded, longer ones
# truncated) so the DataLoader can stack them into batches.
# NOTE(review): the pad value 0 is also a regular token id in the GPT-2
# vocabulary -- confirm the loss is meant to treat padding as a real token.
input_sequences = ensure_tensor_size(input_sequences, 4)
target_sequences = ensure_tensor_size(target_sequences, 4)

# Create a RephraseDataset object from the input and target sequences
dataset = RephraseDataset(input_sequences, target_sequences)

# Create a DataLoader for the dataset
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The model's vocabulary size must cover every id the tokenizer can emit.
# A hard-coded 1000 is far smaller than the GPT-2 vocabulary, so any token id
# >= 1000 indexes past the embedding table and raises
# "IndexError: index out of range in self".
# If extra tokens are ever added to the tokenizer, use len(tokenizer) instead.
model = RephraseGenerator(vocab_size=tokenizer.vocab_size, embedding_dim=256, hidden_size=512, num_layers=2, dropout=0.2)

# Move the model to the device
model.to(device)

# Set the optimizer and loss function
optimizer = optim.AdamW(model.parameters())
# NOTE(review): train() computes its loss with rephrase_loss(), so loss_fn is
# not used by the call below -- confirm which one is intended.
loss_fn = nn.CrossEntropyLoss()

train(model, dataloader, optimizer, device)

And this is my train function:

# Training loop
def train(model, data_loader, optimizer, device):
    """Run one pass over ``data_loader`` and return the mean per-batch loss.

    Args:
        model: module called as ``model(inputs, targets)``; switched to
            training mode before the first batch.
        data_loader: iterable yielding ``(input_sequence, target_sequence)``
            pairs; must also support ``len()``.
        optimizer: optimizer whose gradients are reset and stepped per batch.
        device: device every batch is moved to before the forward pass.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch_inputs, batch_targets in data_loader:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        batch_loss = rephrase_loss(model(batch_inputs, batch_targets), batch_targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    num_batches = len(data_loader)
    return running_loss / num_batches

This is my ensure_tensor_size() function:

def ensure_tensor_size(tensor_list, size, pad_value=0):
    """Force every tensor in the list to length ``size`` along dimension 0.

    Tensors shorter than ``size`` are padded at the end with ``pad_value``;
    tensors longer than ``size`` are truncated to their first ``size``
    entries; tensors that already have the right length are passed through
    unchanged. The input list itself is not modified.

    Args:
        tensor_list: a list of tensors with at least one dimension
        size: an integer representing the desired length along dimension 0
        pad_value: fill value used for padding (defaults to 0)

    Returns:
        a new list of tensors whose first dimension equals ``size``
    """
    padded_tensor_list = []
    for tensor in tensor_list:
        length = tensor.size(0)
        if length < size:
            # F.pad lists (left, right) pairs starting from the LAST dimension,
            # so the pair for dimension 0 comes last; every other dimension
            # gets (0, 0). For 1-D tensors this reduces to (0, size - length).
            pad_spec = (0, 0) * (tensor.dim() - 1) + (0, size - length)
            tensor = F.pad(tensor, pad_spec, value=pad_value)
        elif length > size:
            tensor = tensor[:size]
        padded_tensor_list.append(tensor)
    return padded_tensor_list

This is the error that I'm getting:

IndexError                                Traceback (most recent call last)
<ipython-input-5-1e843ae1e696> in <module>
    233 loss_fn = nn.CrossEntropyLoss()
    234 
--> 235 train(model, dataloader, optimizer, device)

5 frames
/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2208         # remove once script supports set_grad_enabled
   2209         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   2211 
   2212 

IndexError: index out of range in self

This is what my data looks like:

[('Outdoor toys for kids',
  [" Kids' outdoor toys",
   ' Outdoor playthings for children',
   " Children's outdoor entertainment",
   ' Outdoor games for young ones ']),
 ('ducational toys for kids',
  [" Kids' educational toys",
   ' Educational playthings for children',
   " Children's educational entertainment",
   ' Educational games for young ones']),
 ('Dolls for girls',
  [" Girls' dolls",
   ' Dolls for little girls',
   ' Entertainment for young girls',
   ' Toys for young girls']),
 ("Kids' swings",
  [" Children's swings", ' Swings for kids', ' Swings for young ones', '']),
 ("Kids' footballs",
  [" Children's footballs",
   ' Footballs for kids',
   ' Footballs for young ones',
   ' Entertainment for young kids']),
 ('Pogo sticks for children',
  [" Kids' pogo sticks",
   " Children's pogo sticks",
   ' Pogo sticks for kids',
   ' Pogo sticks for young ones']),
 ('Holiday gifts',
  [' Gifts for the holidays',
   ' Gifts for special occasions',
   ' Gifts for celebrations',
   ' Gifts for loved ones']),
 ('Best holiday gifts',
  [' Top holiday gifts',
   ' Highly rated holiday gifts',
   ' Recommended holiday gifts',
   ' Best gifts for the holidays']),
 ('Popular holiday gifts',
  [' Best-selling holiday gifts',
   ' Most sought-after holiday gifts',
   ' Trending holiday gifts',
   ' Hot holiday gifts']),
 ('Holiday gifts for kids',
  [' Gifts for children during the holidays',
   ' Gifts for young ones during the holidays',
   ' Gifts for little ones during the holidays',
   ' Gifts for minors during the holidays']),
 ('Holiday gifts for men',
  [' Gifts for men during the holidays',
   ' Gifts for him during the holidays',
   ' Gifts for fathers during the holidays',
   ' Gifts for husbands during the holidays']),
 ('Holiday gifts for women',
  [' Gifts for women during the holidays',
   ' Gifts for her during the holidays',
   ' Gifts for mothers during the holidays',
   ' Gifts for wives during the holidays']),
 ('Holiday gifts for teens',
  [' Gifts for teenagers during the holidays',
   ' Gifts for adolescents during the holidays',
   ' Gifts for young adults during the holidays',
   ' Gifts for older kids during the holidays']),
 ('Holiday gifts for parents',
  [' Gifts for parents during the holidays',
   ' Gifts for mom and dad during the holidays',
   ' Gifts for caregivers during the holidays',
   ' Gifts for adults during the holidays']),
 ('Holiday gifts for grandparents',
  [' Gifts for grandparents during the holidays',
   ' Gifts for grandpa and grandma during the holidays',
   ' Gifts for senior citizens during the holidays',
   ' Gifts for older adults during the holidays']),
 ('Holiday gifts for friends',
  [' Gifts for friends during the holidays',
   ' Gifts for close friends during the holidays',
   ' Gifts for companions during the holidays',
   ' Gifts for peers during the holidays']),
 ('Holiday gifts for coworkers',
  [' Gifts for coworkers during the holidays',
   ' Gifts for colleagues during the holidays',
   ' Gifts for associates during the holidays',
   ' Gifts for professionals during the holidays']),
 ('Holiday gifts for pets',
  [' Gifts for pets during the holidays',
   ' Gifts for dogs during the holidays',
   ' Gifts for cats during the holidays',
   ' Gifts for animals during the holidays']),
 ('Holiday gifts for gamers',
  [' Gifts for gamers during the holidays',
   ' Gifts for video game enthusiasts during the holidays',
   ' Gifts for console gamers during the holidays',
   ' Gifts for PC gamers during the holidays']),
 ('Holiday gifts for hikers',
  [' Gifts for hikers during the holidays',
   ' Gifts for outdoor enthusiasts during the holidays',
   ' Gifts for walkers during the holidays',
   ' Gifts for nature lovers during the holidays']),
 ('Holiday gifts for book lovers',
  [' Gifts for book lovers during the holidays',
   ' Gifts for readers during the holidays',
   ' Gifts for bibliophiles during the holidays',
   ' Gifts for literature enthusiasts during the holidays']),
 ('Holiday gifts for foodies',
  [' Gifts for foodies during the holidays',
   ' Gifts for gourmet cooks during the holidays',
   ' Gifts for culinary enthusiasts during the holidays',
   ' Gifts for epicures during the holidays']),
 ('Holiday gifts for knitters and crocheters',
  [' Gifts for knitters and crocheters during the holidays',
   ' Gifts for fiber artists during the holidays',
   ' Gifts for yarn enthusiasts during the holidays',
   ' Gifts for needlework enthusiasts during the holidays']),
 ('Holiday gifts for sewers and quilters',
  [' Gifts for sewers and quilters during the holidays',
   ' Gifts for needleworkers during the holidays',
   ' Gifts for seamstresses during the holidays',
   ' Gifts for tailors during the holidays']),
 ('Holiday gifts for DIYers',
  [' Gifts for DIYers during the holidays',
   ' Gifts for home improvement enthusiasts during the holidays',
   ' Gifts for handymen and handywomen during the holidays',
   ' Gifts for crafters during the holidays']),
 ('Holiday gifts for mechanics',
  [' Gifts for mechanics during the holidays',
   ' Gifts for auto mechanics during the holidays',
   ' Gifts for mechanic enthusiasts during the holidays',
   ' Gifts for technicians during the holidays']),
 ('Holiday gifts for handymen and handywomen',
  [' Gifts for handymen and handywomen during the holidays',
   ' Gifts for DIY enthusiasts during the holidays',
   ' Gifts for home improvement experts during the holidays',
   ' Gifts for craftspeople during the holidays']),
 ('Luggage sets',
  [' Best luggage',
   ' Travel luggage',
   ' Suitcase sets',
   ' Best luggage sets ']),
 ('Travel backpacks',
  [' Backpacks for travel',
   ' Best travel backpacks',
   ' Popular travel backpacks',
   ' Backpacks for vacation ']),
 ('Travel pillows',
  [' Best travel pillows',
   ' Top-rated travel pillows',
   ' Popular travel pillows',
   ' Best pillows for travel ']),
 ('Travel neck pillows',
  [' Best travel neck pillows',
   ' Best-selling travel neck pillows',
   ' Neck pillows for travel',
   ' Popular travel neck pillows '])]

Where am I going wrong?


Solution

  • Why the error occurs

    Your model (RephraseGenerator) is built with vocab_size=1000, but your tokenizer (GPT2Tokenizer) has a vocabulary of 50257 tokens. The tokenizer therefore produces token ids >= 1000, and the model throws an IndexError when its embedding layer, which only has 1000 rows, tries to look those ids up.

    Here's a minimal reproduction of the same IndexError:

    import torch
    import torch.nn as nn
    from transformers import GPT2Tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # Token ids come from the full GPT-2 vocabulary
    token_ids = torch.tensor(tokenizer.encode("Hello world", add_special_tokens=True))
    # The embedding table only has 1000 rows
    model = nn.Embedding(1000, 32)
    # This line raises an IndexError
    model(token_ids)
    

    How to fix in general

    Use the correct vocab_size (matching your tokenizer) when constructing your embedding layer:

    # Size the embedding table from the tokenizer's vocabulary
    model = nn.Embedding(tokenizer.vocab_size, 32)
    token_ids = torch.tensor(tokenizer.encode("Hello world", add_special_tokens=True))
    # This line returns the embedding with no errors
    model(token_ids)
    

    How to fix, in OP's codebase

    It looks like you need to replace vocab_size=1000 with vocab_size=tokenizer.vocab_size in the RephraseGenerator() call, to achieve the same effect.