
Creating a HuggingFace Dataset to train a BIO tagger


I have a list of dictionaries:

sentences = [
    {'text': ['I live in Madrid'], 'labels': ['O', 'O', 'O', 'B-LOC']},
    {'text': ['Peter lives in Spain'], 'labels': ['B-PER', 'O', 'O', 'B-LOC']},
    {'text': ['He likes pasta'], 'labels': ['O', 'O', 'B-FOOD']},
    ...
]

I want to create a HuggingFace Dataset object from this data so that I can preprocess it and feed it to a transformer model more easily, but so far I have not found a viable way to do this.


Solution

  • First, you'll need a few extra libraries for the datasets and metrics features:

    pip install -U transformers datasets evaluate seqeval
    

    To convert the list of dicts to a Dataset object:

    import pandas as pd
    from datasets import Dataset
    
    sentences = [
        {'text': 'I live in Madrid', 'labels': ['O', 'O', 'O', 'B-LOC']},
        {'text': 'Peter lives in Spain', 'labels': ['B-PER', 'O', 'O', 'B-LOC']},
        {'text': 'He likes pasta', 'labels': ['O', 'O', 'B-FOOD']},
    ]
    
    
    ds = Dataset.from_pandas(pd.DataFrame(data=sentences))
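
    Alternatively, recent versions of datasets can build the same object directly from the list of dicts, skipping the pandas round-trip:

    ds = Dataset.from_list(sentences)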
    
    

    Then convert the dataset into a "Trainer-able" Dataset object:

    import pandas as pd
    from datasets import ClassLabel, Dataset
    from transformers import AutoTokenizer
    
    # Define a ClassLabel object to map string labels to integers.
    classmap = ClassLabel(num_classes=4, names=['B-LOC', 'B-PER', 'B-FOOD', 'O'])
    
    train_sentences = [
        {'text': 'I live in Madrid', 'labels': ['O', 'O', 'O', 'B-LOC']},
        {'text': 'Peter lives in Spain', 'labels': ['B-PER', 'O', 'O', 'B-LOC']},
        {'text': 'He likes pasta', 'labels': ['O', 'O', 'B-FOOD']},
    ]
    
    ds = Dataset.from_pandas(pd.DataFrame(data=train_sentences))
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
    
    # Map text to tokenizer ids.
    ds = ds.map(lambda x: tokenizer(x["text"], truncation=True))
    
    # Map labels to label ids.
    ds = ds.map(lambda y: {"labels": classmap.str2int(y["labels"])})
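
    Note that these two simple maps leave the word-level labels unaligned with the subword tokens the tokenizer actually produces (and with the special tokens it adds). A sketch of the usual alignment that can replace the two map calls above, assuming a fast tokenizer and whitespace-separable text, labelling only the first subword of each word and masking the rest with -100:

    def tokenize_and_align(example):
        words = example["text"].split()
        enc = tokenizer(words, truncation=True, is_split_into_words=True)
        label_ids = []
        previous_word = None
        for word_id in enc.word_ids():
            if word_id is None:
                label_ids.append(-100)  # special tokens ([CLS], [SEP], ...)
            elif word_id != previous_word:
                label_ids.append(classmap.str2int(example["labels"][word_id]))
            else:
                label_ids.append(-100)  # later subwords of the same word
            previous_word = word_id
        enc["labels"] = label_ids
        return enc

    ds = ds.map(tokenize_and_align)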
    
    

    To compute metrics on your labeled inputs:

    import evaluate
    
    metric = evaluate.load("seqeval")
    
    # seqeval works on string tags, so keep the label names around for decoding.
    label_list = classmap.names
    
    
    def compute_metrics(p):
        predictions, labels = p
        predictions = predictions.argmax(axis=2)
        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
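
    As a quick sanity check, seqeval can also be called directly on string tags (an illustrative toy example, not part of the pipeline above):

    metric.compute(
        predictions=[['O', 'O', 'O', 'B-LOC']],
        references=[['O', 'O', 'O', 'B-LOC']],
    )
    # -> includes 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'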
    
    

    Putting it all together to use with the Trainer object:

    import pandas as pd
    import evaluate
    
    from datasets import Dataset
    from datasets import ClassLabel
    
    from transformers import AutoModelForTokenClassification, Trainer, AutoTokenizer, DataCollatorForTokenClassification
    
    # Define a ClassLabel object to map string labels to integers.
    classmap = ClassLabel(num_classes=4, names=['B-LOC', 'B-PER', 'B-FOOD', 'O'])
    
    train_sentences = [
        {'text': 'I live in Madrid', 'labels': ['O', 'O', 'O', 'B-LOC']},
        {'text': 'Peter lives in Spain', 'labels': ['B-PER', 'O', 'O', 'B-LOC']},
        {'text': 'He likes pasta', 'labels': ['O', 'O', 'B-FOOD']},
    ]
    
    eval_sentences = [
        {'text': 'I like pasta from Madrid , Spain', 'labels': ['O', 'O', 'B-FOOD', 'O', 'B-LOC', 'O', 'B-LOC']},
    ]
    
    ds_train = Dataset.from_pandas(pd.DataFrame(data=train_sentences))
    ds_eval = Dataset.from_pandas(pd.DataFrame(data=eval_sentences))
    
    model = AutoModelForTokenClassification.from_pretrained("distilbert-base-multilingual-cased",
                                                            id2label={i:classmap.int2str(i) for i in range(classmap.num_classes)},
                                                            label2id={c:classmap.str2int(c) for c in classmap.names},
                                                            finetuning_task="ner")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
    data_collator = DataCollatorForTokenClassification(tokenizer)
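    # Note: the collator dynamically pads input_ids, attention_mask and the
    # "labels" column per batch; label positions created by padding get -100
    # so they are ignored by the loss.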
    
    
    ds_train = ds_train.map(lambda x: tokenizer(x["text"], truncation=True))
    ds_eval = ds_eval.map(lambda x: tokenizer(x["text"], truncation=True))
    
    ds_train = ds_train.map(lambda y: {"labels": classmap.str2int(y["labels"])})
    ds_eval = ds_eval.map(lambda y: {"labels": classmap.str2int(y["labels"])})
    
    
    metric = evaluate.load("seqeval")
    label_list = classmap.names  # seqeval expects string tags
    
    
    def compute_metrics(p):
        predictions, labels = p
        predictions = predictions.argmax(axis=2)
        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        train_dataset=ds_train,
        eval_dataset=ds_eval,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    
    
    trainer.train()
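
    With no arguments given, Trainer falls back to default TrainingArguments (writing to tmp_trainer). To control the output directory, epochs, batch size and so on, pass your own (the values here are illustrative):

    from transformers import TrainingArguments
    
    training_args = TrainingArguments(
        output_dir="bio-tagger",  # hypothetical output path
        num_train_epochs=3,
        per_device_train_batch_size=8,
        evaluation_strategy="epoch",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_eval,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )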