pytorch, bert-language-model

Failing to create a transformer from scratch and push it to CUDA (use it on GPU)


In order to learn PyTorch and understand how transformers work, I tried to implement a transformer classifier from scratch (inspired by the Hugging Face book):

from transformers import AutoTokenizer,DataCollatorWithPadding
from bertviz.transformers_neuron_view import BertModel
from transformers import AutoConfig
import torch
from torch import nn
import torch.nn.functional as F
from math import sqrt

model_ckpt = "bert-base-uncased"

# config = AutoConfig.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# model = BertModel.from_pretrained(model_ckpt)
config = {
    "vocab_size": 30522,
    "hidden_size": 768,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "hidden_dropout_prob": 0.1,
    "num_labels": 6,
    "intermediate_size": 3072,
}

class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

config = dotdict(config)

class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size, 
                                             config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        # Create position IDs for input sequence
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

def scaled_dot_product_attention(query, key, value):
    # softmax(Q K^T / sqrt(d_k)) V, computed batch-wise with bmm
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x
        
class TransformerEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Apply layer normalization and then copy input into query, key, value
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection
        x = x + self.attention(hidden_state)
        # Apply feed-forward layer with a skip connection
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x

class TransformerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerEncoderLayer(config) 
                                     for _ in range(config.num_hidden_layers)])

    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x
        
#Adding a Classification Head
class TransformerForSequenceClassification(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        
    def forward(self, x):
        x = self.encoder(x)[:, 0, :] # select hidden state of [CLS] token
        x = self.dropout(x)
        x = self.classifier(x)
        return x

config.num_labels = 6
encoder_classifier = TransformerForSequenceClassification(config)
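
A quick sanity check (a minimal sketch with made-up token IDs, run on the CPU) confirms that the classifier returns one logit per label:

# hypothetical smoke test: a batch of 2 sequences of 10 random token IDs
dummy_input = torch.randint(0, config.vocab_size, (2, 10))
logits = encoder_classifier(dummy_input)
print(logits.shape)  # expected: torch.Size([2, 6])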

Then I preprocess the data:

from datasets import load_dataset
import pandas as pd
emotions = load_dataset("emotion")

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

tokenized_datasets = emotions_encoded.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=8)

from torch.optim import AdamW
optimizer = AdamW(encoder_classifier.parameters(), lr=5e-5)

loss_fn = nn.CrossEntropyLoss()

from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

from tqdm.auto import tqdm
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
encoder_classifier.to(device)
#next(encoder_classifier.parameters()).is_cuda
progress_bar = tqdm(range(num_training_steps))

encoder_classifier.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        import pdb; pdb.set_trace()  # breakpoint used to inspect device placement (see Edit below)
        outputs = encoder_classifier(batch["input_ids"])
        loss = loss_fn(outputs, batch["labels"])
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

I finally got the error: "RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)"

I am not sure that pushing my custom BERT model to the device (cuda) actually works. Do you have an idea why this happens, and how to correct the code so it works on the GPU?

Edit:

The error is raised at this line:

---> 25 outputs = encoder_classifier(batch["input_ids"])

and I checked that the arguments are on cuda as follows:

ipdb>  !next(encoder_classifier.parameters()).is_cuda
True
ipdb>  batch["input_ids"].device
device(type='cuda', index=0)
ipdb>   batch["labels"].device
device(type='cuda', index=0)
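
Since every parameter and every batch tensor is already on cuda, the offending CPU tensor must be one created inside forward itself. The index_select mentioned in the error points at an embedding lookup, and torch.arange places its result on the CPU unless a device is passed explicitly, which is easy to verify (a minimal sketch, assuming a CUDA device is available):

torch.arange(5).device                   # device(type='cpu') -- the default placement
torch.arange(5, device="cuda:0").device  # device(type='cuda', index=0)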

Solution

  • I had to push position_ids to cuda... I feel stupid now :)

    position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0).to(device)
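
A variant that does not depend on a module-level device variable is to create position_ids on whatever device the input already lives on (a sketch of the revised Embeddings.forward, not the original code):

    def forward(self, input_ids):
        seq_length = input_ids.size(1)
        # create the position IDs on the same device as the incoming token IDs,
        # so the module runs on CPU or GPU without referencing a global `device`
        position_ids = torch.arange(seq_length, dtype=torch.long,
                                    device=input_ids.device).unsqueeze(0)
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

Registering the position IDs as a buffer with self.register_buffer would also work, since buffers (unlike plain tensors created in forward) are moved by model.to(device).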