python-3.x, token, huggingface-transformers, named-entity-recognition

How to resolve the error at _name = "label" if "label" in features[0].keys() else "labels" in Hugging Face NER


I am trying to run custom NER on my data using offset values, replicating this tutorial: https://huggingface.co/course/chapter7/2

I keep getting an error at this line inside the data collator:

_name = "label" if "label" in features[0].keys() else "labels"

DATA BEFORE the tokenize_and_align_labels FUNCTION

{'texts': ['WASHINGTON USA WA DRIVER LICENSE BESSETTE Lamma 4d DL 73235766 9 Class AM to Iss 22/03/2021 Ab Exp 07130/2021 DOB 2/28/21 1 BESSETTE 2 GERALD 8 6930 NE Grandview Blvd, keyport, WA 86494 073076 12 Restrictions A 9a End P 16 Hgt 5\'-04" 15 Sex F 18 Eyes BLU 5 DD 73235766900000000000 Gerald Bessette'],
 'tag_names': [[
     {'start': 281, 'end': 296, 'tag': 'PERSON_NAME', 'text': 'Gerald Bessette'},
     {'start': 135, 'end': 141, 'tag': 'FIRST_NAME', 'text': 'GERALD'},
     {'start': 124, 'end': 122, 'tag': 'LAST_NAME', 'text': 'BESSETTE'},
     {'start': 81, 'end': 81, 'tag': 'ISSUE_DATE', 'text': '22/03/2021'},
     {'start': 99, 'end': 109, 'tag': 'EXPIRY_DATE', 'text': '07130/2021'},
     {'start': 114, 'end': 121, 'tag': 'DATE_OF_BIRTH', 'text': '2/28/21'},
     {'start': 51, 'end': 59, 'tag': 'DRIVER_LICENSE_NUMBER', 'text': '73235766'},
     {'start': 144, 'end': 185, 'tag': 'ADDRESS', 'text': '6930 NE Grandview Blvd, keyport, WA 86494'}
 ]]}

DATA AFTER the tokenize_and_align_labels FUNCTION

{'input_ids': 
 [[0, 305, 8684, 2805, 9342, 10994, 26994, 42560, 39951, 163, 12147, 3935, 6433, 6887, 1916, 204, 417, 13925, 6521, 1922, 4390, 4280, 361, 
   4210, 3326, 7, 19285, 820, 73, 3933, 73, 844, 2146, 2060, 12806, 321, 5339, 541, 73, 844, 2146, 14010, 387, 132, 73, 2517, 73, 2146, 112, 
   163, 12147, 3935, 6433, 132, 272, 39243, 495, 290, 5913, 541, 12462, 2374, 5877, 12543, 6, 762, 3427, 6, 9342, 290, 4027, 6405, 13470, 541, 
   5067, 316, 40950, 2485, 83, 361, 102, 4680, 221, 545, 289, 19377, 195, 32269, 3387, 113, 379, 15516, 274, 504, 26945, 12413, 791, 195, 27932,
   6521, 1922, 4390, 36400, 45947, 151, 14651, 163, 3361, 3398, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 


'attention_mask': 
    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
    
'offset_mapping': [[(0, 0), (0, 1), (1, 10), (11, 14), (15, 17), (18, 20), (20, 24), (25, 28), (28, 32), (33, 34), (34, 37), (37, 39), (39, 41), 
                    (42, 45), (45, 47), (48, 49), (49, 50), (51, 53), (54, 56), (56, 58), (58, 60), (60, 62), (63, 64), (65, 70), (71, 73), 
                    (74, 76), (77, 80), (81, 83), (83, 84), (84, 86), (86, 87), (87, 89), (89, 91), (92, 94), (95, 98), (99, 100), (100, 102), 
                    (102, 104), (104, 105), (105, 107), (107, 109), (110, 112), (112, 113), (114, 115), (115, 116), (116, 118), (118, 119), 
                    (119, 121), (122, 123), (124, 125), (125, 128), (128, 130), (130, 132), (133, 134), (135, 136), (136, 140), (140, 141), 
                    (142, 143), (144, 146), (146, 148), (149, 151), (152, 157), (157, 161), (162, 166), (166, 167), (168, 171), (171, 175), 
                    (175, 176), (177, 179), (180, 181), (181, 183), (183, 185), (186, 188), (188, 190), (190, 192), (193, 195), (196, 204), 
                    (204, 208), (209, 210), (211, 212), (212, 213), (214, 217), (218, 219), (220, 222), (223, 224), (224, 226), (227, 228), 
                    (228, 230), (230, 232), (232, 233), (234, 236), (237, 240), (241, 242), (243, 245), (246, 250), (251, 253), (253, 254), 
                    (255, 256), (257, 259), (260, 262), (262, 264), (264, 266), (266, 269), (269, 277), (277, 280), (281, 287), (288, 289), 
                    (289, 292), (292, 296), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 
                    (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]

'labels': [[24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 2, 10, 10, 18, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
            24, 24, 3, 11, 11, 11, 11, 19, 24, 24, 1, 9, 9, 9, 17, 24, 24, 24, 24, 24, 24, 4, 12, 20, 24, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            16, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
            24, 7, 15, 15, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
            24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
            24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
            24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
            24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24], 

My Code:

import transformers
from transformers import AutoTokenizer
from transformers import AutoTokenizer,BertModel,BertTokenizer
from transformers import RobertaModel,RobertaConfig,RobertaForTokenClassification
from transformers import TrainingArguments, Trainer
# from transformers.trainer import get_tpu_sampler
from transformers.trainer_pt_utils import get_tpu_sampler
from transformers.data.data_collator import DataCollator, InputDataClass
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

import torch
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler

from torchcrf import CRF

import dataclasses
import logging
import warnings
import tqdm
import os
import numpy as np
from typing import List, Union, Dict
os.environ["WANDB_DISABLED"] = "true"
print(transformers.__version__)

import evaluate
metric = evaluate.load("seqeval")

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) #add_prefix_space=True


def isin(a, b):
    return a[1] > b[0] and a[0] < b[1]


def tokenize_and_align_labels(examples, label2id, max_length=256):
    tokenized_inputs = tokenizer(examples["texts"], truncation=True, padding='max_length', max_length=max_length,return_offsets_mapping=True)
    print("tokenization done")

    labels = []
    for i, label_idx_for_single_input in enumerate(tqdm.tqdm(examples["tag_names"])):

        #         print(i,label_idx_for_single_input)

        labels_for_single_input = ['O' for _ in range(max_length)]
        #         print(labels_for_single_input)
        text_offsets = tokenized_inputs['offset_mapping'][i]
        #         print("text_offsets",text_offsets)
        for entity in label_idx_for_single_input:
            #             print("entity",entity)
            tag = entity['tag']
            #             print("tag",tag)
            tag_offset = [entity['start'], entity['end']]
            #             print("tag_offset",tag_offset)

            #             text_offsets [(0, 0), (0, 1), (1, 10), (11, 14), (15, 17), (18, 20), (20, 24), (25, 28), (28, 32), (33, 34), (34, 37), (37, 39), (39, 41), (42, 45), (45, 47), (48, 49), (49, 50), (51, 53), (54, 56), (56, 58), (58, 60), (60, 62), (63, 64), (65, 70), (71, 73), (74, 76), (77, 80), (81, 83), (83, 84), (84, 86), (86, 87), (87, 89), (89, 91), (92, 94), (95, 98), (99, 100), (100, 102), (102, 104), (104, 105), (105, 107), (107, 109), (110, 112), (112, 113), (114, 115), (115, 116), (116, 118), (118, 119), (119, 121), (122, 123), (124, 125), (125, 128), (128, 130), (130, 132), (133, 134), (135, 136), (136, 140), (140, 141), (142, 143), (144, 146), (146, 148), (149, 151), (152, 157), (157, 161), (162, 166), (166, 167), (168, 171), (171, 175), (175, 176), (177, 179), (180, 181), (181, 183), (183, 185), (186, 188), (188, 190), (190, 192), (193, 195), (196, 204), (204, 208), (209, 210), (211, 212), (212, 213), (214, 217), (218, 219), (220, 222), (223, 224), (224, 226), (227, 228), (228, 230), (230, 232), (232, 233), (234, 236), (237, 240), (241, 242), (243, 245), (246, 250), (251, 253), (253, 254), (255, 256), (257, 259), (260, 262), (262, 264), (264, 266), (266, 269), (269, 277), (277, 280), (281, 287), (288, 289), (289, 292), (292, 296), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]
            #             entity {'start': 281, 'end': 296, 'tag': 'PERSON_NAME', 'text': 'Gerald Bessette'}
            #             tag PERSON_NAME
            #             tag_offset [281, 296]

            affected_token_ids = [j for j in range(max_length) if isin(tag_offset, text_offsets[j])]
            #             print("affected_token_ids",affected_token_ids)

            if len(affected_token_ids) < 1:
                #                 print('affected_token_ids)<1')
                continue
            if any(labels_for_single_input[j] != 'O' for j in affected_token_ids):
            #                 print('entity overlap! skipping')
                continue

            for j in affected_token_ids:
                labels_for_single_input[j] = 'I_' + tag
            labels_for_single_input[affected_token_ids[-1]] = 'L_' + tag
            labels_for_single_input[affected_token_ids[0]] = 'B_' + tag

        label_ids = [label2id[x] for x in labels_for_single_input]
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    # print(tokenized_inputs.keys())
    return tokenized_inputs


import json

data = []
with open('data.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

l = []
for k, v in data[0].items():
    l.append({'text': k, 'spans': v})

train_set = [
    [
        x['text'],
        [{'start': y["start"], 'end': y["end"], 'tag': y["label"], 'text': y["ngram"]} for y in x['spans']]
    ] for x in l
]

## count labels in dataset
from collections import Counter
e = []
for x in train_set:
    for y in x[1]:
        e.append(y['tag'])
Counter(e).most_common()

## get label list
ori_label_list = []
for line in train_set:
    ori_label_list += [entity['tag'] for entity in line[1]]

ori_label_list = sorted(list(set(ori_label_list)))

label_list = []
for prefix in 'BIL':
    label_list += [prefix + '_' + x for x in ori_label_list]
label_list += ['O']
label_list = sorted(list(set(label_list)))
print(label_list)

print(len(label_list))

label2id = {n:i for i,n in enumerate(label_list)}
id2label= {str(i):n for i,n in enumerate(label_list)}

# id2label = {str(i): label for i, label in enumerate(label_names)}
# label2id = {v: k for k, v in id2label.items()}

train_examples ={'texts':[x[0] for x in train_set],'tag_names':[x[1] for x in train_set]}

train_examples = tokenize_and_align_labels(train_examples,label2id)
# train_examples = train_examples.map(tokenize_and_align_labels(label2id),batched=True)
print("here")
print(train_examples.keys())
print(len(train_examples['labels']))

# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'])
# 775
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# collator=data_collator(train_examples)

# def compute_metrics(eval_preds):
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#
#     # Remove ignored index (special tokens) and convert to labels
#     true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": all_metrics["overall_precision"],
#         "recall": all_metrics["overall_recall"],
#         "f1": all_metrics["overall_f1"],
#         "accuracy": all_metrics["overall_accuracy"],
#     }

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint,id2label=id2label,label2id=label2id,)

print(model.config.num_labels)



args = TrainingArguments(
    "bert-finetuned-ner",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    # push_to_hub=True,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_examples,
    # eval_dataset=train_examples,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer)
trainer.train()


ERROR

 _name = "label" if "label" in features[0].keys() else "labels"
AttributeError: 'tokenizers.Encoding' object has no attribute 'keys'

Solution

  • I think the object tokenized_inputs that you create and return in tokenize_and_align_labels is a BatchEncoding rather than a dict or a Dataset (check this by printing type(my_object) when in doubt). When the Trainer indexes it to collate a batch, each element comes back as a tokenizers.Encoding, which has no .keys(), and that is exactly what the traceback shows.

    You should apply your tokenizer to your examples using the map function of Dataset, as in this example from the documentation; a minimal sketch of that fix is shown below.
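
    A minimal sketch of that fix, assuming the tokenize_and_align_labels, label2id, train_set, model, args, data_collator, and tokenizer defined in the question; dropping offset_mapping before training is an extra assumption on my part, since the model's forward pass does not take that column:

    from datasets import Dataset

    # Build a datasets.Dataset from the raw dict (i.e. before it is overwritten
    # by the direct call to tokenize_and_align_labels in the question's code).
    raw_dataset = Dataset.from_dict({'texts': [x[0] for x in train_set],
                                     'tag_names': [x[1] for x in train_set]})

    # Apply the tokenizer through Dataset.map so each training example is a
    # plain dict that the data collator can call .keys() on.
    tokenized_dataset = raw_dataset.map(
        lambda batch: tokenize_and_align_labels(batch, label2id),
        batched=True,
        remove_columns=raw_dataset.column_names,
    )

    # offset_mapping was only needed to align the labels; the model does not
    # accept it as an input, so drop it before training (assumption noted above).
    tokenized_dataset = tokenized_dataset.remove_columns("offset_mapping")

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    With a Dataset, features[0] inside DataCollatorForTokenClassification is a plain dict, so the "label"/"labels" lookup from the traceback no longer fails.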