I am currently trying to replicate the article
https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
to get an introduction to PyTorch and BERT.
I used some own sample corpus and corresponding tragets as practise, but the code throws the following:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-4-8577755f37de> in <module>()
201 LR = 1e-6
202
--> 203 trainer(model, df_train, df_val, LR, EPOCHS)
3 frames
<ipython-input-4-8577755f37de> in trainer(model, train_data, val_data, learning_rate, epochs)
162 output = model(input_id, mask)
163
--> 164 batch_loss = criterion(output, torch.max(train_label,1)[1])
165 total_loss_train += batch_loss.item()
166
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
1150 return F.cross_entropy(input, target, weight=self.weight,
1151 ignore_index=self.ignore_index, reduction=self.reduction,
-> 1152 label_smoothing=self.label_smoothing)
1153
1154
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
2844 if size_average is not None or reduce is not None:
2845 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2846 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
2847
2848
IndexError: Target 32 is out of bounds.
The code is mostly identical to the one in the article, except of course the more extensive lable-dict.
Orginial:
labels = {'business':0,
'entertainment':1,
'sport':2,
'tech':3,
'politics':4
}
Mine:
labels =
{'Macroeconomics': 0,
'Microeconomics': 1,
'Labor Economics': 2,
'Subnational Fiscal Issues': 3,
'Econometrics': 4,
'International Economics': 5,
'Financial Economics': 6,
'Health, Education, and Welfare': 7,
'Public Economics': 8,
'Development and Growth': 9,
'Industrial Organization': 10,
'Other': 11,
'Environmental and Resource Economics': 12,
'History': 13,
'Regional and Urban Economics': 14,
'Development Economics': 15,
'Corporate Finance': 16,
'Children': 17,
'Labor Studies': 18,
'Economic Fluctuations and Growth': 19,
'Economics of Aging': 20,
'Economics of Education': 21,
'International Trade and Investment': 22,
'Asset Pricing': 23,
'Health Economics': 24,
'Law and Economics': 25,
'International Finance and Macroeconomics': 26,
'Monetary Economics': 27,
'Technical Working Papers': 28,
'Political Economy': 29,
'Development of the American Economy': 30,
'Health Care': 31,
'Productivity, Innovation, and Entrepreneurship': 32}
Code:
class Dataset(torch.utils.data.Dataset):
def __init__(self, df):
self.labels = torch.LongTensor([labels[label] for label in df["category"]])
self.texts = [tokenizer(text,
padding='max_length', max_length = 512, truncation=True,
return_tensors="pt") for text in df['text']]
def classes(self):
return self.labels
def __len__(self):
return len(self.labels)
def get_batch_labels(self, idx):
# Fetch a batch of labels
return np.array(self.labels[idx])
def get_batch_texts(self, idx):
# Fetch a batch of inputs
return self.texts[idx]
def __getitem__(self, idx):
batch_texts = self.get_batch_texts(idx)
batch_y = np.array(range(0,len(labels)))
return batch_texts, batch_y
#Splitting the sample into trainingset, validationset and testset (80,10,10)
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
[int(.8*len(df)), int(.9*len(df))])
print(len(df_train),len(df_val), len(df_test))
from torch import nn
class BertClassifier(nn.Module):
def __init__(self, dropout=0.5):
super(BertClassifier, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-cased')
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(768, 5)
self.relu = nn.ReLU()
def forward(self, input_id, mask):
_, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
dropout_output = self.dropout(pooled_output)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
return final_layer
from torch.optim import Adam
from tqdm import tqdm
def trainer(model, train_data, val_data, learning_rate, epochs):
train, val = Dataset(train_data), Dataset(val_data)
train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr= learning_rate)
if use_cuda:
model = model.cuda()
criterion = criterion.cuda()
for epoch_num in range(epochs):
total_acc_train = 0
total_loss_train = 0
for train_input, train_label in tqdm(train_dataloader):
train_label = train_label.to(device)
mask = train_input['attention_mask'].to(device)
input_id = train_input['input_ids'].squeeze(1).to(device)
output = model(input_id, mask)
batch_loss = criterion(output, torch.max(train_label,1)[1])
total_loss_train += batch_loss.item()
acc = (output.argmax(dim=1) == train_label).sum().item()
total_acc_train += acc
model.zero_grad()
batch_loss.backward()
optimizer.step()
total_acc_val = 0
total_loss_val = 0
with torch.no_grad():
for val_input, val_label in val_dataloader:
val_label = val_label.to(device)
mask = val_input['attention_mask'].to(device)
input_id = val_input['input_ids'].squeeze(1).to(device)
output = model(input_id, mask)
batch_loss = criterion(output, val_label)
total_loss_val += batch_loss.item()
acc = (output.argmax(dim=1) == val_label).sum().item()
total_acc_val += acc
print(
f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
| Train Accuracy: {total_acc_train / len(train_data): .3f} \
| Val Loss: {total_loss_val / len(val_data): .3f} \
| Val Accuracy: {total_acc_val / len(val_data): .3f}')
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
trainer(model, df_train, df_val, LR, EPOCHS)
You're creating a list of length 33 in your __getitem__
call which is one more than the length of the labels list, hence the out of bounds error. In fact, you create the same list each time this method is called. You're supposed to fetch the associated y
with the X
found at idx
.
If you replace batch_y = np.array(range(...))
with batch_y = np.array(self.labels[idx])
, you'll fix your error. Indeed, this is already implemented in your get_batch_labels
method.