"for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader: NameError: name 'data_loader' is not defined"


I am trying to implement a question answering model with a BERT transformer implemented by jugapuff. Link to the code: https://github.com/jugapuff/BERT-for-bAbi-task

After executing the main.py file, which is shown below, I'm getting this error: "for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader: NameError: name 'data_loader' is not defined"

from dataloader import bAbi_Dataset
import torch
import torch.nn as nn
from model import model
from pytorch_transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    print("GPU:" + str(torch.cuda.get_device_name(0)))
    
my_model = model()
my_model.to(device)

optimizer = AdamW(my_model.parameters())
criterion = nn.NLLLoss()


EPOCHS = 10
for epoch in range(1, EPOCHS+1):
    
    my_model.train()
    
    train_loss = 0
    length = 0
    for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader:
        output = my_model(tokens_tensor.to(device), segments_tensors.to(device), att_mask.to(device), pos_id.to(device))
        loss = criterion(output, trg.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        length+=1
        train_loss += loss.item()
        if length % 10 == 0:
          print("\t\t{:3}/25000 : {}".format(length, train_loss / length))
        
    epoch_loss = train_loss / length
    print("##################")
    print("{} epoch Loss : {:.4f}".format(epoch, epoch_loss))
  

and dataloader.py is as follows:

import os
import torch
import torch.utils.data as data
from pytorch_transformers import BertTokenizer

def _parse( file, only_supporting=False):
        data, story = [], []
        for line in file:
            tid, text = line.rstrip('\n').split(' ', 1)
            if tid == '1':
                story = []
            if text.endswith('.'):
                story.append(text[:])
            else:
                query, answer, supporting = (x.strip() for x in text.split('\t'))
                if only_supporting:
                    substory = [story[int(i) - 1] for i in supporting.split()]
                else:
                    substory = [x for x in story if x]
                data.append((substory, query[:-1], answer))
                story.append("")
        return data
    
def build_trg_dics(tenK=True, path="tasks_1-20_v1-2", train=True):
    
    if tenK:
        dirname = os.path.join(path, 'en-10k')
    else:
        dirname = os.path.join(path, 'en')

    for (dirpath, dirnames, filenames) in os.walk(dirname):
        filenames = filenames

    if train:
        filenames = [filename for filename in filenames if  "train.txt" in filename]
    else:
        filenames = [filename for filename in filenames if  "test.txt" in filename]

    temp = []
    for filename in filenames:
        f = open(os.path.join(dirname, filename), 'r')
        parsed =_parse(f)
        temp.extend([d[2] for d in parsed])
    temp = set(temp)
    
    trg_word2id = {word:i for i, word in enumerate(temp)}
    trg_id2word = {i:word for i, word in enumerate(temp)}
    return trg_word2id, trg_id2word


class bAbi_Dataset(data.Dataset):
    
    def __init__(self, trg_word2id, tenK=True, path = "tasks_1-20_v1-2", train=True):
        # joint is Default
        
        
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        
        if tenK:
            dirname = os.path.join(path, 'en-10k')
        else:
            dirname = os.path.join(path, 'en')
            
        for (dirpath, dirnames, filenames) in os.walk(dirname):
            filenames = filenames
         
        if train:
            filenames = [filename for filename in filenames if  "train.txt" in filename]
        else:
            filenames = [filename for filename in filenames if  "test.txt" in filename]
        
        self.src = []
        self.trg = []
        
        for filename in filenames:
            f = open(os.path.join(dirname, filename), 'r')
            parsed = _parse(f)
            self.src.extend([d[:2] for d in parsed])
            self.trg.extend([trg_word2id[d[2]] for d in parsed])
        self.trg = torch.tensor(self.trg)
            
            
    def __getitem__(self, index):
        src_seq = self.src[index]
        trg = self.trg[index]
        src_seq, seg_seq, att_mask, pos_id = self.preprocess_sequence(src_seq)
        
        return src_seq, seg_seq, att_mask, pos_id, trg

    def __len__(self):
        return len(self.trg)
        
    def preprocess_sequence(self, seq):

        text =  ["[CLS]"] + list(seq[0]) + ["[SEP]"] + [seq[1]] + ["[SEP]"]

        tokenized_text = self.tokenizer.tokenize(" ".join(text))
        indexed_text = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        where_is_sep = indexed_text.index(102) + 1
        segment_ids = [0 ]* (where_is_sep) + [1] * (len(indexed_text)- where_is_sep)
        attention_mask = [1] *len(indexed_text)
        pos_id = [i for i in range(len(indexed_text))]
        
        return torch.tensor(indexed_text), torch.tensor(segment_ids), torch.tensor(attention_mask), torch.tensor(pos_id)
    
    

def collate_fn(data):
    def merge(sequences):
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), 512).long()
        for i, seq in enumerate(sequences):
            
            
            end = lengths[i]
            if end <= 512:
                padded_seqs[i, :end] = seq[:end]
            else:
                padded_seqs[i] = seq[-512:]

        return padded_seqs
      
    def pos_merge(sequences):
        
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), 512).long()
        for i, seq in enumerate(sequences):
            
            padded_seqs[i] = torch.tensor([i for i in range(512)])

        return padded_seqs
    
    src_seqs, seg_seqs, att_mask, pos_id, trgs = zip(*data)
    src_seqs = merge(src_seqs)
    seg_seqs = merge(seg_seqs)
    att_mask = merge(att_mask)
    pos_id = pos_merge(pos_id)
    trgs = torch.tensor(trgs)
    return src_seqs, seg_seqs, att_mask, pos_id, trgs

The data_loader variable declaration is missing in main.py, so I tried to build the loop myself as

for tokens_tensor, segments_tensors, att_mask, pos_id, trg in dataloader.collate_fn(bAbi_Dataset):

using the collate_fn() function from dataloader.py, but it did not work. With this change, I get the following error:

Traceback (most recent call last):
  File "main.py", line 27, in <module>
  File "/content/BERT-for-bAbi-task/dataloader.py", line 133, in collate_fn
    src_seqs, seg_seqs, att_mask, pos_id, trgs = zip(*data)
  File "/usr/lib/python3.6/typing.py", line 682, in inner
    return func(*args, **kwds)
  File "/usr/lib/python3.6/typing.py", line 1107, in __getitem__
    params = tuple(_type_check(p, msg) for p in params)
  File "/usr/lib/python3.6/typing.py", line 1107, in <genexpr>
    params = tuple(_type_check(p, msg) for p in params)
  File "/usr/lib/python3.6/typing.py", line 374, in _type_check
    raise TypeError(msg + " Got %.100r." % (arg,))
TypeError: Parameters to generic types must be types. Got 0.

Could anyone please help me correct this error?


Solution

  • I will just give you some pointers:

    • collate_fn is not meant to be called with a dataset as its argument. It is a special callback function passed to a DataLoader and used to collate individual elements into a batch.

    • Since bAbi_Dataset in /dataloader.py is defined as a torch.utils.data.Dataset, I would guess you are meant to instantiate it instead. Its constructor is defined as:

      def __init__(self, trg_word2id, tenK=True, path = "tasks_1-20_v1-2", train=True)
      

      There is another function, build_trg_dics, in /dataloader.py which parses the content of the files and builds the target-word dictionaries. You should take a look at it before setting the right arguments for bAbi_Dataset.

    • Lastly, once your dataset is initialized, you can attach a dataloader to it using torch.utils.data.DataLoader. This would look something like:

      data_loader = DataLoader(dataset, batch_size=16)
      

      At this point, you might even need to plug in the collate function provided in /dataloader.py.
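
      Putting these pointers together, a rough sketch of the missing setup in main.py could look like this; the batch size, shuffling, and data path are assumptions taken from the defaults in /dataloader.py, so adjust them to your setup:

      from torch.utils.data import DataLoader
      from dataloader import bAbi_Dataset, build_trg_dics, collate_fn

      # Build the answer-word dictionaries first; the dataset needs trg_word2id.
      trg_word2id, trg_id2word = build_trg_dics(tenK=True, path="tasks_1-20_v1-2", train=True)

      # Instantiate the dataset (not the class itself) and hand collate_fn to the DataLoader.
      dataset = bAbi_Dataset(trg_word2id, tenK=True, path="tasks_1-20_v1-2", train=True)
      data_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

      # The training loop in main.py can then iterate as originally written:
      # for tokens_tensor, segments_tensors, att_mask, pos_id, trg in data_loader:
      #     ...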

    If you don't really know what you are doing, I would suggest you start with a working repository and work your way from there. Good luck!