I've recently been trying to get hands-on experience with the Transformers library from Hugging Face. Since I'm an absolute noob when it comes to using PyTorch (and deep learning in general), I started with the introduction that can be found here.
Here is the code to install the dependencies:
#!pip install transformers
!pip install transformers[sentencepiece] # transformers plus the sentencepiece extra
!pip install datasets # datasets from the Hugging Face Hub
!pip install tqdm
Here's how they propose to fine-tune BERT on the MRPC dataset (used in the GLUE benchmark). This dataset includes two sentences per "sample", so we have to pass both sentence1 and sentence2 to the tokenizer.
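Just to illustrate that sentence-pair call (a quick sketch of my own, not from the course):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# BERT encodes the pair as [CLS] sentence1 [SEP] sentence2 [SEP];
# token_type_ids marks which tokens belong to which sentence
encoded = tokenizer("He ate the cake.", "The cake was eaten by him.", truncation=True)
print(encoded.keys())  # input_ids, token_type_ids, attention_mask
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
Here is the full fine-tuning code they propose: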
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
# functions defining how the tokenizer works
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
# the data collator will apply dynamic padding (https://huggingface.co/course/chapter3/2?fw=pt); see the quick check after this block
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# remove unnecessary columns from the data and format as torch tensors
tokenized_datasets = tokenized_datasets.remove_columns(
    ["sentence1", "sentence2", "idx"]
)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)
# loading model and training requirements
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print(num_training_steps)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
progress_bar = tqdm(range(num_training_steps))
# training loop:
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        # assert 1==0
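As a quick sanity check of the dynamic padding mentioned in the comment above, the collator can be called by hand on a few samples (my own sketch, not part of the course code):
# take a few tokenized training samples and collate them manually:
# DataCollatorWithPadding pads each batch only to the length of its longest sample
samples = [tokenized_datasets["train"][i] for i in range(8)]
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})
# e.g. input_ids has shape [8, longest_length_among_these_8_samples]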
This works perfectly fine for me in Google Colab. I wanted to do the same thing with another dataset, sst2. The code I use is very similar to the one above; the only lines that change are the ones that load the data and the tokenization (there is one sentence per sample instead of two). I have double-checked and the tokenizer works fine.
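A quick look at the dataset features confirms the structure (my own check, not part of the script below):
from datasets import load_dataset

raw_datasets = load_dataset("glue", "sst2")
print(raw_datasets["train"].features)
# 'sentence' is a plain string, 'label' is a ClassLabel with two classes
# (negative/positive), plus an integer 'idx' column
Here is my full code: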
# imports
import torch
from datasets import load_dataset # datasets from huggingface
# tokenization
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
# training
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from tqdm.auto import tqdm
# Hyperparameters
batch_size = 8
learning_rate = 5e-5
num_epochs = 3
num_warmup_steps = 0
# load dataset and choosing checkpoint
raw_datasets = load_dataset("glue", "sst2")
checkpoint = "bert-base-uncased"
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# tokenization of dataset
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
# setting DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
)
# import model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
# setup training loop
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = num_epochs * len(train_dataloader)
print(num_training_steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
# choose device (GPU or CPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        for k, v in batch.items():
            print(f"key={k}, v.dtype={v.dtype}, type(v)={type(v)}")
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
And here's the error I get:
RuntimeError Traceback (most recent call last)
<ipython-input-11-7893d7715ac2> in <module>()
69 outputs = model(**batch)
70 loss = outputs.loss
---> 71 loss.backward()
72
73 optimizer.step()
1 frames
/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
147 Variable._execution_engine.run_backward(
148 tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 149 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
150
151
RuntimeError: Found dtype Long but expected Float
This seems like a very silly mistake, but like I said I'm an absolute PyTorch noob and it's difficult for me to know where to start solving this issue. I have checked the types of the values in batch.items() and in both cases they are all torch.int64 (or torch.long). I tried changing the attention_mask and input_ids values to torch.float32, but I got the same error message.
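For reference, reproducing that dtype check outside the training loop looks roughly like this (a sketch, assuming the same train_dataloader as above):
# pull a single batch out of the DataLoader and inspect the dtypes
batch = next(iter(train_dataloader))
for k, v in batch.items():
    print(k, v.dtype, tuple(v.shape))
# input_ids, token_type_ids, attention_mask and labels all come back as torch.int64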
Thanks in advance.
I found the source of the problem: it comes from the line
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
With num_labels=1, the sequence classification head is treated as a regression head, so its loss is a mean squared error that expects float targets; feeding it the integer class labels is what triggers the Found dtype Long but expected Float error. Since the dataset has 2 classes, the correct way of calling the model is
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
With this modification, my code now works.
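A minimal way to see the difference between the two settings (my own sketch, using only bert-base-uncased and two toy sentences):
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(["a great movie", "a terrible movie"], padding=True, return_tensors="pt")
labels = torch.tensor([1, 0])  # SST-2 style integer class labels

# num_labels=1 -> regression head: the loss is a mean squared error that expects
# float targets, so backpropagating with long labels fails with
# "Found dtype Long but expected Float"
# num_labels=2 -> two-class classification head: cross-entropy, which is exactly
# what integer class labels need
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
loss = model(**inputs, labels=labels).loss
loss.backward()  # works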