I am trying to implement a question answering model with the BERT transformer implementation by jugapuff. Link to the code: https://github.com/jugapuff/BERT-for-bAbi-task
After executing the main.py file, which is also shown below, I get this error: "for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader: NameError: name 'data_loader' is not defined"
from dataloader import bAbi_Dataset
import torch
import torch.nn as nn
from model import model
from pytorch_transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("GPU:" + str(torch.cuda.get_device_name(0)))

my_model = model()
my_model.to(device)

optimizer = AdamW(my_model.parameters())
criterion = nn.NLLLoss()

EPOCHS = 10
for epoch in range(1, EPOCHS + 1):
    my_model.train()
    train_loss = 0
    length = 0
    for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader:
        output = my_model(tokens_tensor.to(device), segments_tensors.to(device), att_mask.to(device), pos_id.to(device))
        loss = criterion(output, trg.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        length += 1
        train_loss += loss.item()
        if length % 10 == 0:
            print("\t\t{:3}/25000 : {}".format(length, train_loss / length))

    epoch_loss = train_loss / length
    print("##################")
    print("{} epoch Loss : {:.4f}".format(epoch, epoch_loss))
and dataloader.py is as follows:
import os
import torch
import torch.utils.data as data
from pytorch_transformers import BertTokenizer


def _parse(file, only_supporting=False):
    data, story = [], []
    for line in file:
        tid, text = line.rstrip('\n').split(' ', 1)
        if tid == '1':
            story = []
        if text.endswith('.'):
            story.append(text[:])
        else:
            query, answer, supporting = (x.strip() for x in text.split('\t'))
            if only_supporting:
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                substory = [x for x in story if x]
            data.append((substory, query[:-1], answer))
            story.append("")
    return data


def build_trg_dics(tenK=True, path="tasks_1-20_v1-2", train=True):
    if tenK:
        dirname = os.path.join(path, 'en-10k')
    else:
        dirname = os.path.join(path, 'en')
    for (dirpath, dirnames, filenames) in os.walk(dirname):
        filenames = filenames
    if train:
        filenames = [filename for filename in filenames if "train.txt" in filename]
    else:
        filenames = [filename for filename in filenames if "test.txt" in filename]

    temp = []
    for filename in filenames:
        f = open(os.path.join(dirname, filename), 'r')
        parsed = _parse(f)
        temp.extend([d[2] for d in parsed])
    temp = set(temp)

    trg_word2id = {word: i for i, word in enumerate(temp)}
    trg_id2word = {i: word for i, word in enumerate(temp)}
    return trg_word2id, trg_id2word


class bAbi_Dataset(data.Dataset):
    def __init__(self, trg_word2id, tenK=True, path="tasks_1-20_v1-2", train=True):
        # joint is Default
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        if tenK:
            dirname = os.path.join(path, 'en-10k')
        else:
            dirname = os.path.join(path, 'en')
        for (dirpath, dirnames, filenames) in os.walk(dirname):
            filenames = filenames
        if train:
            filenames = [filename for filename in filenames if "train.txt" in filename]
        else:
            filenames = [filename for filename in filenames if "test.txt" in filename]

        self.src = []
        self.trg = []
        for filename in filenames:
            f = open(os.path.join(dirname, filename), 'r')
            parsed = _parse(f)
            self.src.extend([d[:2] for d in parsed])
            self.trg.extend([trg_word2id[d[2]] for d in parsed])
        self.trg = torch.tensor(self.trg)

    def __getitem__(self, index):
        src_seq = self.src[index]
        trg = self.trg[index]
        src_seq, seg_seq, att_mask, pos_id = self.preprocess_sequence(src_seq)
        return src_seq, seg_seq, att_mask, pos_id, trg

    def __len__(self):
        return len(self.trg)

    def preprocess_sequence(self, seq):
        text = ["[CLS]"] + list(seq[0]) + ["[SEP]"] + [seq[1]] + ["[SEP]"]
        tokenized_text = self.tokenizer.tokenize(" ".join(text))
        indexed_text = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        where_is_sep = indexed_text.index(102) + 1
        segment_ids = [0] * where_is_sep + [1] * (len(indexed_text) - where_is_sep)
        attention_mask = [1] * len(indexed_text)
        pos_id = [i for i in range(len(indexed_text))]
        return torch.tensor(indexed_text), torch.tensor(segment_ids), torch.tensor(attention_mask), torch.tensor(pos_id)


def collate_fn(data):
    def merge(sequences):
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), 512).long()
        for i, seq in enumerate(sequences):
            end = lengths[i]
            if end <= 512:
                padded_seqs[i, :end] = seq[:end]
            else:
                padded_seqs[i] = seq[-512:]
        return padded_seqs

    def pos_merge(sequences):
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), 512).long()
        for i, seq in enumerate(sequences):
            padded_seqs[i] = torch.tensor([i for i in range(512)])
        return padded_seqs

    src_seqs, seg_seqs, att_mask, pos_id, trgs = zip(*data)
    src_seqs = merge(src_seqs)
    seg_seqs = merge(seg_seqs)
    att_mask = merge(att_mask)
    pos_id = pos_merge(pos_id)
    trgs = torch.tensor(trgs)
    return src_seqs, seg_seqs, att_mask, pos_id, trgs
The data_loader variable declaration in main.py is missing, so I tried to work around it by changing the loop to
for tokens_tensor, segments_tensors, att_mask, pos_id, trg in dataloader.collate_fn(bAbi_Dataset):
to use the collate_fn() function in dataloader.py, but it did not work. With that change, it gives the following error:
Traceback (most recent call last):
  File "main.py", line 27, in <module>
  File "/content/BERT-for-bAbi-task/dataloader.py", line 133, in collate_fn
    src_seqs, seg_seqs, att_mask, pos_id, trgs = zip(*data)
  File "/usr/lib/python3.6/typing.py", line 682, in inner
    return func(*args, **kwds)
  File "/usr/lib/python3.6/typing.py", line 1107, in __getitem__
    params = tuple(_type_check(p, msg) for p in params)
  File "/usr/lib/python3.6/typing.py", line 1107, in <genexpr>
    params = tuple(_type_check(p, msg) for p in params)
  File "/usr/lib/python3.6/typing.py", line 374, in _type_check
    raise TypeError(msg + " Got %.100r." % (arg,))
TypeError: Parameters to generic types must be types. Got 0.
Could anyone please help me correct this error?
I will just give you some pointers:
collate_fn is not meant to be called with a dataset as its argument. It is a special callback function passed to a dataloader and used to collate batch elements into a batch.
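In other words, the dataloader fetches individual items from the dataset and hands them to collate_fn as a list; conceptually it does something like the following sketch (just an illustration, not code from the repo):

# Roughly what DataLoader does internally for a batch of 3 items:
batch = [dataset[i] for i in (0, 1, 2)]  # each item is (src, seg, att_mask, pos_id, trg)
tokens, segments, att_mask, pos_id, trg = collate_fn(batch)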
Since bAbi_Dataset in /dataloader.py is defined as a torch.utils.data.Dataset, I would guess you are meant to initialize it instead. Its constructor is defined as:
def __init__(self, trg_word2id, tenK=True, path="tasks_1-20_v1-2", train=True)
There is another function, build_trg_dics, in /dataloader.py which is used to parse the content from the files. You should take a look at it before setting the right arguments for bAbi_Dataset.
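For instance, a minimal sketch of that initialization, assuming the default bAbI data layout under tasks_1-20_v1-2 (verify the exact arguments against the repo):

from dataloader import bAbi_Dataset, build_trg_dics

# Build the answer-word vocabulary from the training files (assumed usage,
# based on the build_trg_dics signature shown above).
trg_word2id, trg_id2word = build_trg_dics(tenK=True, path="tasks_1-20_v1-2", train=True)

# Initialize the dataset with that vocabulary.
dataset = bAbi_Dataset(trg_word2id, tenK=True, path="tasks_1-20_v1-2", train=True)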
Lastly, once you have your dataset initialized, you can attach a dataloader to it using torch.utils.data.DataLoader. This would look like:
data_loader = DataLoader(dataset, batch_size=16)
At this point, you might even need to plug in the collate function provided in /dataloader.py.
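Put together, the missing piece in main.py could look roughly like this sketch (batch_size=16 is just a placeholder; collate_fn is the function from the repo's dataloader.py):

from torch.utils.data import DataLoader
from dataloader import collate_fn

# collate_fn pads/stacks the individual (src, seg, att_mask, pos_id, trg) items
# into fixed-size batch tensors, which is what the training loop expects.
data_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader:
    ...  # training step as in main.py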
If you don't really know what you are doing, I would suggest you start with a working repository and work your way from there. Good luck!