I am trying to run custom NER on my data using offset values, following this guide: https://huggingface.co/course/chapter7/2
I keep getting an error from the data collator at this line:
_name = "label" if "label" in features[0].keys() else "labels"
Data before the tokenize_and_align_labels function:
{'texts': ['WASHINGTON USA WA DRIVER LICENSE BESSETTE Lamma 4d DL 73235766 9 Class AM to Iss 22/03/2021 Ab Exp 07130/2021 DOB 2/28/21 1 BESSETTE 2 GERALD 8 6930 NE Grandview Blvd, keyport, WA 86494 073076 12 Restrictions A 9a End P 16 Hgt 5\'-04" 15 Sex F 18 Eyes BLU 5 DD 73235766900000000000 Gerald Bessette'],
'tag_names': [
[
{'start': 281, 'end': 296, 'tag': 'PERSON_NAME', 'text': 'Gerald Bessette'},
{'start': 135, 'end': 141, 'tag': 'FIRST_NAME', 'text': 'GERALD'},
{'start': 124, 'end': 122, 'tag': 'LAST_NAME', 'text': 'BESSETTE'},
{'start': 81, 'end': 81, 'tag': 'ISSUE_DATE', 'text': '22/03/2021'},
{'start': 99, 'end': 109, 'tag': 'EXPIRY_DATE', 'text': '07130/2021'},
{'start': 114, 'end': 121, 'tag': 'DATE_OF_BIRTH', 'text': '2/28/21'},
{'start': 51, 'end': 59, 'tag': 'DRIVER_LICENSE_NUMBER', 'text': '73235766'},
{'start': 144, 'end': 185, 'tag': 'ADDRESS', 'text': '6930 NE Grandview Blvd, keyport, WA 86494'}
],
Data after the tokenize_and_align_labels function:
{'input_ids':
[[0, 305, 8684, 2805, 9342, 10994, 26994, 42560, 39951, 163, 12147, 3935, 6433, 6887, 1916, 204, 417, 13925, 6521, 1922, 4390, 4280, 361,
4210, 3326, 7, 19285, 820, 73, 3933, 73, 844, 2146, 2060, 12806, 321, 5339, 541, 73, 844, 2146, 14010, 387, 132, 73, 2517, 73, 2146, 112,
163, 12147, 3935, 6433, 132, 272, 39243, 495, 290, 5913, 541, 12462, 2374, 5877, 12543, 6, 762, 3427, 6, 9342, 290, 4027, 6405, 13470, 541,
5067, 316, 40950, 2485, 83, 361, 102, 4680, 221, 545, 289, 19377, 195, 32269, 3387, 113, 379, 15516, 274, 504, 26945, 12413, 791, 195, 27932,
6521, 1922, 4390, 36400, 45947, 151, 14651, 163, 3361, 3398, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'attention_mask':
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'offset_mapping': [[(0, 0), (0, 1), (1, 10), (11, 14), (15, 17), (18, 20), (20, 24), (25, 28), (28, 32), (33, 34), (34, 37), (37, 39), (39, 41),
(42, 45), (45, 47), (48, 49), (49, 50), (51, 53), (54, 56), (56, 58), (58, 60), (60, 62), (63, 64), (65, 70), (71, 73),
(74, 76), (77, 80), (81, 83), (83, 84), (84, 86), (86, 87), (87, 89), (89, 91), (92, 94), (95, 98), (99, 100), (100, 102),
(102, 104), (104, 105), (105, 107), (107, 109), (110, 112), (112, 113), (114, 115), (115, 116), (116, 118), (118, 119),
(119, 121), (122, 123), (124, 125), (125, 128), (128, 130), (130, 132), (133, 134), (135, 136), (136, 140), (140, 141),
(142, 143), (144, 146), (146, 148), (149, 151), (152, 157), (157, 161), (162, 166), (166, 167), (168, 171), (171, 175),
(175, 176), (177, 179), (180, 181), (181, 183), (183, 185), (186, 188), (188, 190), (190, 192), (193, 195), (196, 204),
(204, 208), (209, 210), (211, 212), (212, 213), (214, 217), (218, 219), (220, 222), (223, 224), (224, 226), (227, 228),
(228, 230), (230, 232), (232, 233), (234, 236), (237, 240), (241, 242), (243, 245), (246, 250), (251, 253), (253, 254),
(255, 256), (257, 259), (260, 262), (262, 264), (264, 266), (266, 269), (269, 277), (277, 280), (281, 287), (288, 289),
(289, 292), (292, 296), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),
(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)],
'labels': [[24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 2, 10, 10, 18, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 3, 11, 11, 11, 11, 19, 24, 24, 1, 9, 9, 9, 17, 24, 24, 24, 24, 24, 24, 4, 12, 20, 24, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
16, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 7, 15, 15, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24],
My Code:
import transformers
from transformers import AutoTokenizer, BertModel, BertTokenizer
from transformers import RobertaModel, RobertaConfig, RobertaForTokenClassification
from transformers import TrainingArguments, Trainer
# from transformers.trainer import get_tpu_sampler
from transformers.trainer_pt_utils import get_tpu_sampler
from transformers.data.data_collator import DataCollator, InputDataClass
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
import torch
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from torchcrf import CRF
import dataclasses
import logging
import warnings
import tqdm
import os
import numpy as np
from typing import List, Union, Dict
os.environ["WANDB_DISABLED"] = "true"
print(transformers.__version__)
import evaluate
metric = evaluate.load("seqeval")
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) #add_prefix_space=True
def isin(a, b):
    # True if the character spans a and b overlap
    return a[1] > b[0] and a[0] < b[1]
def tokenize_and_align_labels(examples, label2id, max_length=256):
    # tokenize with offsets so character-level entity spans can be mapped onto tokens
    tokenized_inputs = tokenizer(examples["texts"], truncation=True, padding='max_length',
                                 max_length=max_length, return_offsets_mapping=True)
    print("tokenization done")
    labels = []
    for i, label_idx_for_single_input in enumerate(tqdm.tqdm(examples["tag_names"])):
        labels_for_single_input = ['O' for _ in range(max_length)]
        text_offsets = tokenized_inputs['offset_mapping'][i]
        for entity in label_idx_for_single_input:
            # e.g. entity = {'start': 281, 'end': 296, 'tag': 'PERSON_NAME', 'text': 'Gerald Bessette'}
            tag = entity['tag']
            tag_offset = [entity['start'], entity['end']]
            # tokens whose character span overlaps the entity span
            affected_token_ids = [j for j in range(max_length) if isin(tag_offset, text_offsets[j])]
            if len(affected_token_ids) < 1:
                continue
            if any(labels_for_single_input[j] != 'O' for j in affected_token_ids):
                # entity overlap, skip
                continue
            for j in affected_token_ids:
                labels_for_single_input[j] = 'I_' + tag
            labels_for_single_input[affected_token_ids[-1]] = 'L_' + tag
            labels_for_single_input[affected_token_ids[0]] = 'B_' + tag
        label_ids = [label2id[x] for x in labels_for_single_input]
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
import json
data = []
with open('data.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))
l = []
for k, v in data[0].items():
    l.append({'text': k, 'spans': v})
train_set = [
    [
        x['text'],
        [{'start': y["start"], 'end': y["end"], 'tag': y["label"], 'text': y["ngram"]} for y in x['spans']]
    ]
    for x in l
]
## count labels in dataset
from collections import Counter
e = []
for x in train_set:
    for y in x[1]:
        e.append(y['tag'])
Counter(e).most_common()
## get label list
ori_label_list = []
for line in train_set:
    ori_label_list += [entity['tag'] for entity in line[1]]
ori_label_list = sorted(list(set(ori_label_list)))
label_list = []
for prefix in 'BIL':
    label_list += [prefix + '_' + x for x in ori_label_list]
label_list += ['O']
label_list = sorted(list(set(label_list)))
print(label_list)
print(len(label_list))
label2id = {n: i for i, n in enumerate(label_list)}
id2label = {str(i): n for i, n in enumerate(label_list)}
train_examples ={'texts':[x[0] for x in train_set],'tag_names':[x[1] for x in train_set]}
train_examples = tokenize_and_align_labels(train_examples,label2id)
# train_examples = train_examples.map(tokenize_and_align_labels(label2id),batched=True)
print("here")
print(train_examples.keys())
print(len(train_examples['labels']))
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'])
# 775
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# collator=data_collator(train_examples)
# def compute_metrics(eval_preds):
# logits, labels = eval_preds
# predictions = np.argmax(logits, axis=-1)
#
# # Remove ignored index (special tokens) and convert to labels
# true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
# true_predictions = [
# [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
# for prediction, label in zip(predictions, labels)
# ]
# all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
# return {
# "precision": all_metrics["overall_precision"],
# "recall": all_metrics["overall_recall"],
# "f1": all_metrics["overall_f1"],
# "accuracy": all_metrics["overall_accuracy"],
# }
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)
print(model.config.num_labels)
args = TrainingArguments(
    "bert-finetuned-ner",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    # push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_examples,
    # eval_dataset=train_examples,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()
ERROR
_name = "label" if "label" in features[0].keys() else "labels"
AttributeError: 'tokenizers.Encoding' object has no attribute 'keys'
I think the object tokenized_inputs that you create and return in tokenize_and_align_labels is a BatchEncoding, not a dict or a Dataset of examples (check this by printing type(myobject) when in doubt). When the Trainer indexes it to build a batch, each item comes back as a tokenizers.Encoding, which has no keys() method, and that is exactly what the data collator complains about.
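For example, a quick check along these lines should show the mismatch (a sketch reusing the tokenizer from your code, assuming the fast tokenizer that AutoTokenizer loads by default for bert-base-cased; enc is just an illustrative name):

enc = tokenizer(["some text"], return_offsets_mapping=True)
print(type(enc))     # transformers.tokenization_utils_base.BatchEncoding
print(type(enc[0]))  # tokenizers.Encoding -- this is what the collator receives as features[0]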
You should apply your tokenizer to your examples with the map function of a datasets.Dataset, as in the token-classification example from the course chapter you linked.
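A minimal sketch of that fix, reusing your tokenize_and_align_labels, label2id, model, args and data_collator unchanged (Dataset.from_dict and map come from the datasets library; the column names follow your train_examples dict):

from datasets import Dataset

# build a Dataset from the same lists you already put in train_examples
raw_ds = Dataset.from_dict({'texts': [x[0] for x in train_set],
                            'tag_names': [x[1] for x in train_set]})

# map applies your function batch-wise and returns a Dataset whose items are plain dicts
train_ds = raw_ds.map(
    lambda examples: tokenize_and_align_labels(examples, label2id),
    batched=True,
    remove_columns=raw_ds.column_names,  # drop the raw text/span columns after tokenization
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

Because train_ds is a datasets.Dataset, indexing it returns plain dicts, so DataCollatorForTokenClassification can call features[0].keys(); the Trainer's default remove_unused_columns behaviour should also drop extra columns such as offset_mapping before they reach the model.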