In order to learn PyTorch and understand how transformers work, I tried to implement a transformer classifier from scratch (inspired by the Hugging Face book):
from transformers import AutoTokenizer,DataCollatorWithPadding
from bertviz.transformers_neuron_view import BertModel
from transformers import AutoConfig
import torch
from torch import nn
import torch.nn.functional as F
from math import sqrt
model_ckpt = "bert-base-uncased"
# config = AutoConfig.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# model = BertModel.from_pretrained(model_ckpt)
config = {
"vocab_size": 30522,
"hidden_size": 768,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"hidden_dropout_prob": 0.1,
"num_labels": 6,
"intermediate_size": 3072,
}
class dotdict(dict):
"""dot.notation access to dictionary attributes"""
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
config = dotdict(config)
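# With dotdict, the config values can now be read with attribute access,
# e.g. config.hidden_size -> 768 and config.num_labels -> 6
# (equivalent to config["hidden_size"] and config["num_labels"]).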
class Embeddings(nn.Module):
def __init__(self, config):
super().__init__()
self.token_embeddings = nn.Embedding(config.vocab_size,
config.hidden_size)
self.position_embeddings = nn.Embedding(config.max_position_embeddings,
config.hidden_size)
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
self.dropout = nn.Dropout()
def forward(self, input_ids):
# Create position IDs for input sequence
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)
# Create token and position embeddings
token_embeddings = self.token_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
# Combine token and position embeddings
embeddings = token_embeddings + position_embeddings
embeddings = self.layer_norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def scaled_dot_product_attention(query, key, value):
dim_k = query.size(-1)
scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
weights = F.softmax(scores, dim=-1)
return torch.bmm(weights, value)
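# Quick shape check (illustrative): the attention output keeps the shape of the
# value tensor, here [batch_size, seq_len, head_dim].
_q, _k, _v = torch.randn(2, 5, 64), torch.randn(2, 5, 64), torch.randn(2, 5, 64)
print(scaled_dot_product_attention(_q, _k, _v).shape)  # torch.Size([2, 5, 64])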
class AttentionHead(nn.Module):
def __init__(self, embed_dim, head_dim):
super().__init__()
self.q = nn.Linear(embed_dim, head_dim)
self.k = nn.Linear(embed_dim, head_dim)
self.v = nn.Linear(embed_dim, head_dim)
def forward(self, hidden_state):
attn_outputs = scaled_dot_product_attention(
self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
return attn_outputs
class MultiHeadAttention(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config.hidden_size
num_heads = config.num_attention_heads
head_dim = embed_dim // num_heads
self.heads = nn.ModuleList(
[AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
)
self.output_linear = nn.Linear(embed_dim, embed_dim)
def forward(self, hidden_state):
x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
x = self.output_linear(x)
return x
class FeedForward(nn.Module):
def __init__(self, config):
super().__init__()
self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
self.gelu = nn.GELU()
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, x):
x = self.linear_1(x)
x = self.gelu(x)
x = self.linear_2(x)
x = self.dropout(x)
return x
class TransformerEncoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
self.attention = MultiHeadAttention(config)
self.feed_forward = FeedForward(config)
def forward(self, x):
# Apply layer normalization and then copy input into query, key, value
hidden_state = self.layer_norm_1(x)
# Apply attention with a skip connection
x = x + self.attention(hidden_state)
# Apply feed-forward layer with a skip connection
x = x + self.feed_forward(self.layer_norm_2(x))
return x
class TransformerEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.embeddings = Embeddings(config)
self.layers = nn.ModuleList([TransformerEncoderLayer(config)
for _ in range(config.num_hidden_layers)])
def forward(self, x):
x = self.embeddings(x)
for layer in self.layers:
x = layer(x)
return x
# Adding a Classification Head
class TransformerForSequenceClassification(nn.Module):
def __init__(self, config):
super().__init__()
self.encoder = TransformerEncoder(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, x):
x = self.encoder(x)[:, 0, :] # select hidden state of [CLS] token
x = self.dropout(x)
x = self.classifier(x)
return x
config.num_labels = 6
encoder_classifier = TransformerForSequenceClassification(config)
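As a quick sanity check before training, the untrained classifier can be run on CPU with a single tokenized sentence; the logits should have shape [batch_size, num_labels]:

sample = tokenizer("time flies like an arrow", return_tensors="pt")
with torch.no_grad():
    logits = encoder_classifier(sample.input_ids)
print(logits.shape)  # torch.Size([1, 6])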
Then I preprocess the data:
from datasets import load_dataset
import pandas as pd
emotions = load_dataset("emotion")
def tokenize(batch):
return tokenizer(batch["text"], padding=True, truncation=True)
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
tokenized_datasets = emotions_encoded.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=8)
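# Sanity check on one batch: with set_format("torch") every remaining column comes
# back as a tensor, and since tokenize() was mapped with batch_size=None the whole
# split is padded to the same length, so the default collate_fn can stack it
# without a DataCollator.
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})
# e.g. {'labels': torch.Size([8]), 'input_ids': torch.Size([8, L]), ...} with L the longest sequence in the split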
from torch.optim import AdamW
optimizer = AdamW(encoder_classifier.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()
from transformers import get_scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
name="linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
from tqdm.auto import tqdm
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
encoder_classifier.to(device)
#next(encoder_classifier.parameters()).is_cuda
progress_bar = tqdm(range(num_training_steps))
encoder_classifier.train()
for epoch in range(num_epochs):
for batch in train_dataloader:
batch = {k: v.to(device) for k, v in batch.items()}
import pdb;pdb.set_trace()
outputs = encoder_classifier(batch["input_ids"])
loss = loss_fn(outputs, batch["labels"])
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
Running this, I finally got the following error:
"RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)"
I am not sure that moving my custom BERT model to the device (CUDA) actually works. Do you have an idea why this happens and how to correct the code so that it runs on the GPU?
Edit:
The error is raised at this line:
---> 25 outputs = encoder_classifier(batch["input_ids"])
and I checked that the arguments are on CUDA as follows:
ipdb> !next(encoder_classifier.parameters()).is_cuda
True
ipdb> batch["input_ids"].device
device(type='cuda', index=0)
ipdb> batch["labels"].device
device(type='cuda', index=0)
I had to push position_ids to CUDA... I feel stupid now :)
position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0).to(device)
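For reference, a variant that avoids relying on the global device variable is to create the position IDs directly on the device of input_ids inside Embeddings.forward. A minimal sketch of just that method:

def forward(self, input_ids):
    seq_length = input_ids.size(1)
    # Build the position IDs on the same device as the input IDs (CPU or GPU)
    position_ids = torch.arange(seq_length, dtype=torch.long,
                                device=input_ids.device).unsqueeze(0)
    token_embeddings = self.token_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)
    embeddings = self.layer_norm(token_embeddings + position_embeddings)
    return self.dropout(embeddings)

Another option is to register the position IDs as a buffer with self.register_buffer in __init__, so that encoder_classifier.to(device) moves them together with the model parameters.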