Search code examples
Tags: python, spacy, spacy-3, spacy-transformers

How to register custom components in a SpaCy config.cfg file?


As the title states: I seem to have followed the documentation as described, and I have looked all over the web for a useful answer, but so far I have not found much. Any help is much appreciated! Thank you!

I am running the command:

python -m spacy debug config config.cfg --code 'matcher.py' --code 'sentence.py'              

and

python -m spacy train 'config.cfg' --output 'config\' --code 'sentence.py' --code 'matcher.py'

Both get the same error:

ValueError: [E002] Can't find factory for 'sentence_splitter' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, matcher, en.lemmatizer

Here is my config file:

[paths]
train = "output_data.spacy"
dev = "output_data.spacy"
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
# "sentence_splitter" and "matcher" are custom components registered in
# sentence.py and matcher.py; "spacytextblob" comes from the spacytextblob
# package. Each of them must be registered (imported) before this config is
# resolved, otherwise spaCy reports error E002 for the missing factory.
pipeline = ["tok2vec","ner","tagger","sentence_splitter", "parser", "senter","attribute_ruler","matcher","lemmatizer","spacytextblob"]
disabled = ["senter", "tagger", "attribute_ruler","spacytextblob"]
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 256
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.sentence_splitter]
factory = "sentence_splitter"



[components.attribute_ruler]
factory = "attribute_ruler"
scorer = {"@scorers":"spacy.attribute_ruler_scorer.v1"}
validate = false

[components.lemmatizer]
factory = "lemmatizer"
mode = "rule"
model = null
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.ner.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = true

[components.ner.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30
moves = null
scorer = {"@scorers":"spacy.parser_scorer.v1"}
update_with_oracle_cut_size = 100

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "tok2vec"

[components.senter]
factory = "senter"
overwrite = false
scorer = {"@scorers":"spacy.senter_scorer.v1"}

[components.senter.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false

[components.senter.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.senter.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 16
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
rows = [1000,500,500,500,50]
include_static_vectors = true

[components.senter.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 16
depth = 2
window_size = 1
maxout_pieces = 2

[components.spacytextblob]
factory = "spacytextblob"
blob_only = false
custom_blob = null

[components.tagger]
factory = "tagger"
label_smoothing = 0.0
neg_prefix = "!"
overwrite = false
scorer = {"@scorers":"spacy.tagger_scorer.v1"}

[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
upstream = "tok2vec"

[components.matcher]
factory = "matcher"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode:width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY","IS_SPACE"]
rows = [5000,1000,2500,2500,50,50]
include_static_vectors = true

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system:seed}
gpu_allocator = ${system:gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 5000
max_epochs = 0
max_steps = 100000
eval_frequency = 1000
frozen_components = []
before_to_disk = null
annotating_components = []
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
tag_acc = 0.16
dep_uas = 0.0
dep_las = 0.16
dep_las_per_type = null
sents_p = null
sents_r = null
sents_f = 0.02
lemma_acc = 0.5
ents_f = 0.16
ents_p = 0.0
ents_r = 0.0
ents_per_type = null
speed = 0.0

[pretraining]

[initialize]
vocab_data = null
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
before_init = null
after_init = null

[initialize.components]

[initialize.components.ner]

[initialize.components.ner.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/ner.json"
require = false

[initialize.components.parser]

[initialize.components.parser.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/parser.json"
require = false

[initialize.components.tagger]

[initialize.components.tagger.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/tagger.json"
require = false

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]

[initialize.tokenizer]

sentence.py file

import spacy
from spacy.language import Language
import re

@Language.component("sentence_splitter")  # stateless
def sentence_splitter(doc):
    """Flag sentence starts on newline tokens.

    A token whose entire text matches the newline pattern, and which does not
    immediately follow another such token, is marked as a sentence start.
    Every other token is explicitly marked as not being a sentence start.

    Args:
        doc: The document whose tokens get their ``sent_start`` flag set.

    Returns:
        The same ``doc`` object, modified in place.
    """
    newline_run = re.compile(r"(\r?\n\s*)+|(\n\s*)+")
    previous_was_delimiter = False
    for token in doc:
        is_delimiter = newline_run.fullmatch(token.text) is not None
        # Only the first token of a run of consecutive delimiters opens a sentence.
        token.sent_start = is_delimiter and not previous_was_delimiter
        previous_was_delimiter = is_delimiter
    return doc

# Load the "en_core_web_lg" pipeline and insert the custom "sentence_splitter"
# component after its "ner" component.
# NOTE(review): these are module-level statements, so they also run whenever
# this file is imported (e.g. through `--code`) — confirm that is intended.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("sentence_splitter", name="sentence_splitter", after='ner')

matcher.py file

import spacy
from spacy.language import Language
import re
from spacy.matcher import Matcher
from spacy.tokens import Token

@Language.factory("matcher")  # stateful
def create_template_matcher(nlp, name):
    """Factory registered under the name "matcher".

    Args:
        nlp: Object whose ``vocab`` attribute is passed to the matcher.
        name: Component instance name; accepted but not used here.

    Returns:
        A new ``TemplateMatcher`` built from ``nlp.vocab``.
    """
    shared_vocab = nlp.vocab
    return TemplateMatcher(shared_vocab)

# Pipeline component that runs a spaCy Matcher over a doc and sets the custom
# token attribute `templates` to True on every token inside a match.
class TemplateMatcher:
    # Build the Matcher from `vocab` and register four named pattern sets.
    def __init__(self, vocab):
        # Define multiple patterns
        # NOTE(review): `blar blar blar` is placeholder text standing in for
        # the real patterns; it is not valid Python as written.
        patterns1 = [blar blar blar ]
        patterns2 = [blar blar blar ]
        patterns3 = [blar blar blar ]
        patterns4 = [blar blar blar ]

        Token.set_extension("templates", default=False, force=True)  # Register a new token extension to flag matched patterns
        self.matcher = Matcher(vocab)
        self.matcher.add("patterns1", patterns1)
        self.matcher.add("patterns2", patterns2)
        self.matcher.add("patterns3", patterns3)
        self.matcher.add("patterns4", patterns4)

    # Run the matcher on `doc`, flag every token covered by a match, and
    # return the same `doc`.
    def __call__(self, doc):
        matches = self.matcher(doc)
        # Each match is unpacked as (match_id, start, end); doc[start:end] is
        # the matched token span.
        for match_id, start, end in matches:
            for token in doc[start:end]:
                token._.templates = True
        return doc

# Load the "en_core_web_lg" pipeline and insert the custom "matcher"
# component after its "parser" component.
# NOTE(review): these are module-level statements, so they also run whenever
# this file is imported (e.g. through `--code`) — confirm that is intended.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("matcher", name="matcher", after ='parser')

Solution

  • Per the docs:

    The --code argument can be used to provide a Python file that’s imported before the training process starts.

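    Easy to miss, but it says "file" (singular) rather than "files". I don't think you can pass the --code argument more than once with a different Python file each time. (Newer spaCy releases may accept several comma-separated paths in a single --code value; check python -m spacy train --help for your installed version.)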

    However, if you add both of your custom components to the same module, e.g. custom_components.py, and run python -m spacy init fill-config config.cfg config.cfg --code custom_components.py, this should tell you if anything is wrong with your config. If nothing is wrong, you can proceed with training using python -m spacy train config.cfg --output config/ --code custom_components.py

    """Contents of custom_components.py
    
    Notes:
        - I had to import `SpacyTextBlob` for the config to know what 
        factory to use
        - I had to modify your `TemplateMatcher` to get the `fill-config` 
        command to work.
    """
    
    import re
    
    from spacy.language import Language
    from spacy.matcher import Matcher
    from spacy.tokens import Token
    # NOTE: have to import `SpacyTextBlob` for config file to work
    from spacytextblob.spacytextblob import SpacyTextBlob
    
    
    @Language.component("sentence_splitter")  # stateless
    def sentence_splitter(doc):
        """Flag sentence starts on newline tokens.

        A token whose entire text matches the newline pattern, and which does
        not immediately follow another such token, is marked as a sentence
        start. Every other token is explicitly marked as not being a sentence
        start.

        Args:
            doc: The document whose tokens get their ``sent_start`` flag set.

        Returns:
            The same ``doc`` object, modified in place.
        """
        newline_run = re.compile(r"(\r?\n\s*)+|(\n\s*)+")
        previous_was_delimiter = False
        for token in doc:
            is_delimiter = newline_run.fullmatch(token.text) is not None
            # Only the first token of a run of consecutive delimiters opens a sentence.
            token.sent_start = is_delimiter and not previous_was_delimiter
            previous_was_delimiter = is_delimiter
        return doc
    
    
    @Language.factory("matcher")  # stateful
    def create_template_matcher(nlp, name):
        """Factory registered under the name "matcher".

        Args:
            nlp: Object whose ``vocab`` attribute is passed to the matcher.
            name: Component instance name; accepted but not used here.

        Returns:
            A new ``TemplateMatcher`` built from ``nlp.vocab``.
        """
        shared_vocab = nlp.vocab
        return TemplateMatcher(shared_vocab)
    
    
    class TemplateMatcher:
        """Pipeline component that flags tokens covered by any matcher pattern.

        Calling an instance on a doc runs the internal ``Matcher`` and sets the
        custom token attribute ``templates`` to ``True`` on every token inside
        a match.
        """

        def __init__(self, vocab):
            """Create the matcher on ``vocab`` and register four named patterns."""
            # NOTE *** modifications made here ***
            blar = {"ORTH": "blar"}
            named_patterns = {
                "patterns1": [blar],
                "patterns2": [blar],
                "patterns3": [blar],
                "patterns4": [blar],
            }

            # Register a new token extension to flag matched patterns
            Token.set_extension("templates", default=False, force=True)
            self.matcher = Matcher(vocab)
            for rule_name, token_specs in named_patterns.items():
                # Matcher.add takes a list of patterns, so wrap the single pattern.
                self.matcher.add(rule_name, [token_specs])

        def __call__(self, doc):
            """Mark every token inside a match and return the same ``doc``."""
            for _match_id, begin, stop in self.matcher(doc):
                for token in doc[begin:stop]:
                    token._.templates = True
            return doc
    
    

    After running the python -m spacy init fill-config command...

    python -m spacy init fill-config config.cfg config.cfg --code custom_components.py
    

    We get the green checkmark.

    ✔ Auto-filled config with all values
    ✔ Saved config                                    
    config.cfg                                        
    You can now add your data and train your pipeline:
    python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
    

    You should now be good to run python -m spacy train config.cfg --output config/ --code custom_components.py.

    References