I am building a unit annotater based on the idea of MedaCy. First all basic unit types are set, which are then used to build more complex units. For example:
m/s² → m DISTANCE / s DURATION ² → m/s SPEED ² → m/s² ACCELERATION
For this purpose I changed part of the tokenization so that numbers are always separated from alphabetic characters and similar symbols.
'< 2.0 m/s²' → ['<', '2.0', 'm', '/', 's', '²']
However, my current issue is that I can only achieve the last step (ACCELERATION) by merging the tokens whenever an entity is recognized. This results in the loss of feature and entity information of the underlying tokens, which I definitely want to avoid. Therefore I disabled the merging of tokens, but now I cannot achieve the last step of annotating entities like acceleration. This is because the matcher works token-based. As seen below, the matcher is not able to detect the entity since it spans over multiple tokens. (Note that speed is annotated correctly over multiple tokens.)
[{'ENT_TYPE': 'speed'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
['<', '2.0', 'm', '/', 's', '²']
Adding all the possible token combinations to the acceleration matcher would not be a viable solution for me, since it would interfere with the concept of building all units from the bottom up.
Another solution which has come to my mind is using multiple entity rulers: first the basic units would have to be tagged, and then subsequently the more complex ones. However, it seems to run into the same tokenization issue, and furthermore I get the error message that there can only be one entity ruler: 'entity_ruler' already exists in pipeline. Existing names: ['entity_ruler']
In summary I want to annotate entities using entities spanning over multiple tokens. Hence token based matching does not work.
This is called right after creating a blank spaCy model.
def remove_units(nlp):
    """Customize the tokenizer of *nlp* so that numbers, units, slashes and
    superscripts become separate tokens, e.g. '2.0m/s²' -> '2.0','m','/','s','²'.

    Mutates ``nlp.tokenizer`` in place (suffix and prefix rules); returns None.
    Call this right after creating the blank spaCy model.
    """
    suffixes = list(nlp.Defaults.suffixes)
    # spaCy's default suffix rule that keeps '2.0km' etc. as a single token.
    # NOTE(review): this literal must match the installed spaCy version's
    # default byte-for-byte; the removal is guarded so a version mismatch
    # does not raise ValueError.
    UNITS = '(?<=[0-9])(?:km|km²|km³|m|m²|m³|dm|dm²|dm³|cm|cm²|cm³|mm|mm²|mm³|ha|µm|nm|yd|in|ft|kg|g|mg|µg|t|lb|oz|m/s|km/h|kmh|mph|hPa|Pa|mbar|mb|MB|kb|KB|gb|GB|tb|TB|T|G|M|K|%|км|км²|км³|м|м²|м³|дм|дм²|дм³|см|см²|см³|мм|мм²|мм³|нм|кг|г|мг|м/с|км/ч|кПа|Па|мбар|Кб|КБ|кб|Мб|МБ|мб|Гб|ГБ|гб|Тб|ТБ|тбكم|كم²|كم³|م|م²|م³|سم|سم²|سم³|مم|مم²|مم³|كم|غرام|جرام|جم|كغ|ملغ|كوب|اكواب)'
    if UNITS in suffixes:
        suffixes.remove(UNITS)
    # BUG FIX: the original rebuilt the suffix tuple from nlp.Defaults.suffixes
    # here, silently discarding the removal above. Build on the modified list.
    suffixes = tuple(suffixes) + (
        # Split numbers from letters. '[A-Za-z]' replaces the original '[A-z]',
        # which accidentally also matched '[', '\', ']', '^', '_' and '`'.
        r'(?<=[0-9])(?:[A-Za-z]+[^.,:;]*)',
        r'/',  # split trailing slashes: 'm/' -> 'm','/'
    )
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search
    # Split leading slashes from other characters: '/s' -> '/','s'
    prefixes = tuple(nlp.Defaults.prefixes) + (r'/',)
    prefix_regex = spacy.util.compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Token
import re
class UnitAnnotator(object):
    """spaCy pipeline component that annotates measurement units bottom-up.

    Basic unit classes (duration, distance, ...) are matched first; compound
    units are then matched on top of the entity labels the earlier passes
    produced (speed = distance '/' duration, acceleration = speed '²', ...).
    Tokens are never merged, so every token keeps its unit-class flags in the
    ``token._.is_*`` extensions even after a larger entity span overwrites
    its entity label in ``doc.ents``.
    """

    name = "unit_annotator"
    dependencies = []

    def __init__(self, nlp):
        self.nlp = nlp
        # Per-token flags recording the unit class(es) a token belongs to.
        # These survive entity overwriting by larger compound-unit spans.
        Token.set_extension('is_duration_unit', default=False, force=True)
        Token.set_extension('is_memory_unit', default=False, force=True)
        Token.set_extension('is_fraction_unit', default=False, force=True)
        Token.set_extension('is_angle_unit', default=False, force=True)
        Token.set_extension('is_distance_unit', default=False, force=True)
        Token.set_extension('is_pressure_unit', default=False, force=True)
        Token.set_extension('is_voltage_unit', default=False, force=True)
        Token.set_extension('is_speed_unit', default=False, force=True)
        Token.set_extension('is_acceleration_unit', default=False, force=True)
        Token.set_extension('is_frequency_unit', default=False, force=True)
        Token.set_extension('is_volume_unit', default=False, force=True)
        Token.set_extension('is_torque_unit', default=False, force=True)
        Token.set_extension('is_operator', default=False, force=True)
        Token.set_extension('is_measurement', default=False, force=True)
        # Matchers that locate tokens still containing several symbols, so
        # __call__ can split them before any unit matching happens.
        self.split_matcher1 = Matcher(nlp.vocab)
        self.split_matcher1.add('SPLIT1', None,
            [{'TEXT': {'REGEX': r'[/\>\<]'}}],  # 'km/h' -> 'km','/','h'
        )
        self.split_matcher2 = Matcher(nlp.vocab)
        self.split_matcher2.add('SPLIT2', None,
            [{'TEXT': {'REGEX': r'[²³]'}}],  # 'm²' -> 'm','²'
            [{'TEXT': {'REGEX': r'\)'}}],    # '8)' -> '8',')'
            # TODO: FIX splitting of '(mis)interventions'
        )
        self.duration_matcher = Matcher(nlp.vocab)
        # Key renamed from 'UNIT_OF_duration' for consistency with the
        # all-uppercase keys of every other matcher (the id is never read back).
        self.duration_matcher.add('UNIT_OF_DURATION', None,
            [{'ORTH': 'ms'}],
            [{'LOWER': 'msec'}],
            [{'LOWER': 'milisecond'}],
            [{'LOWER': 'miliseconds'}],
            [{'ORTH': 's'}],
            [{'LOWER': 'sec'}],
            [{'LOWER': 'second'}],
            [{'LOWER': 'seconds'}],
            [{'LOWER': 'min'}],
            [{'LOWER': 'mins'}],
            [{'LOWER': 'minute'}],
            [{'LOWER': 'minutes'}],
            [{'ORTH': 'h'}],
            [{'LOWER': 'hour'}],
            [{'LOWER': 'hours'}]
        )
        self.memory_matcher = Matcher(nlp.vocab)
        self.memory_matcher.add('UNIT_OF_MEMORY', None,
            [{'LOWER': 'kb'}],
            [{'LOWER': 'kbs'}],
            [{'LOWER': 'kbit'}],
            [{'LOWER': 'kbits'}],
            [{'LOWER': 'mb'}],
            [{'LOWER': 'mbs'}],
            [{'LOWER': 'mbit'}],
            [{'LOWER': 'mbits'}],
            [{'LOWER': 'gb'}],
            [{'LOWER': 'gbs'}],
            [{'LOWER': 'gbit'}],
            [{'LOWER': 'gbits'}],
            [{'LOWER': 'tb'}],
            [{'LOWER': 'tbs'}],
            [{'LOWER': 'bit'}],
            [{'LOWER': 'bits'}],
            [{'LOWER': 'byte'}],
            [{'LOWER': 'bytes'}],
            [{'LOWER': 'kilobyte'}],
            [{'LOWER': 'kilobytes'}],
            [{'LOWER': 'megabyte'}],
            [{'LOWER': 'megabytes'}],
            [{'LOWER': 'gigabyte'}],
            [{'LOWER': 'gigabytes'}],
            [{'LOWER': 'terrabyte'}],
            [{'LOWER': 'terrabytes'}],
        )
        self.fraction_matcher = Matcher(nlp.vocab)
        self.fraction_matcher.add('UNIT_OF_FRACTION', None,
            [{'ORTH': '%'}],
            [{'LOWER': 'percent'}],
            [{'LOWER': 'per'}, {'LOWER': 'cent'}]
        )
        self.angle_matcher = Matcher(nlp.vocab)
        self.angle_matcher.add('UNIT_OF_ANGLE', None,
            [{'LOWER': '°'}],
            [{'LOWER': '°c'}],
            [{'LOWER': 'deg'}],
            [{'LOWER': 'degs'}],
            [{'LOWER': 'degree'}],
            [{'LOWER': 'degrees'}],
        )
        self.distance_matcher = Matcher(nlp.vocab)
        self.distance_matcher.add('UNIT_OF_DISTANCE', None,
            [{'ORTH': 'nm'}],
            [{'LOWER': 'nanometer'}],
            [{'LOWER': 'nanometers'}],
            [{'ORTH': 'µm'}],
            [{'LOWER': 'micrometer'}],
            [{'LOWER': 'mircometers'}],
            [{'ORTH': 'mm'}],
            [{'LOWER': 'milimeter'}],
            [{'LOWER': 'milimeters'}],
            [{'ORTH': 'cm'}],
            [{'LOWER': 'cendurationter'}],
            [{'LOWER': 'cendurationters'}],
            [{'ORTH': 'm'}],
            [{'LOWER': 'meter'}],
            [{'LOWER': 'meters'}],
            [{'ORTH': 'km'}],
            [{'LOWER': 'kilometer'}],
            [{'LOWER': 'kilometers'}],
            [{'LOWER': 'zoll'}],
        )
        self.pressure_matcher = Matcher(nlp.vocab)
        self.pressure_matcher.add('UNIT_OF_PRESSURE', None,
            [{'LOWER': 'bar'}]  # Maybe add F/A
        )
        self.voltage_matcher = Matcher(nlp.vocab)
        self.voltage_matcher.add('UNIT_OF_VOLTAGE', None,
            [{'ORTH': 'V'}],
            # BUG FIX: pattern key must be uppercase — the lowercase 'lower'
            # key was not a valid Matcher attribute and never matched 'volt'.
            [{'LOWER': 'volt'}],
        )
        # Compound units below build on the entity labels of the basic passes.
        self.speed_matcher = Matcher(nlp.vocab)
        self.speed_matcher.add('UNIT_OF_SPEED', None,
            [{'ENT_TYPE': 'distance'}, {'LOWER': {'REGEX': r'/|p'}}, {'ENT_TYPE': 'duration'}]
        )
        self.acceleration_matcher = Matcher(nlp.vocab)
        self.acceleration_matcher.add('UNIT_OF_ACCELERATION', None,
            # BUG FIX: a speed entity spans several tokens ('m','/','s'), each
            # labelled 'speed'; the '+' quantifier lets the single-token
            # ENT_TYPE condition cover the whole entity.
            [{'ENT_TYPE': 'speed', 'OP': '+'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
        )
        self.frequency_matcher = Matcher(nlp.vocab)
        self.frequency_matcher.add('UNIT_OF_FREQUENCY', None,
            [{'LOWER': 'hz'}],
            [{'LOWER': 'herz'}],  # common misspelling
            [{'LOWER': 'hertz'}],
            [{'LOWER': '1'}, {'ORTH': '/'}, {'ENT_TYPE': 'duration'}]
        )
        self.volume_matcher = Matcher(nlp.vocab)
        self.volume_matcher.add('UNIT_OF_VOLUME', None,
            [{'LOWER': 'l'}],
            [{'LOWER': 'liter'}],
            # NOTE(review): distance entities are single tokens here; add
            # 'OP': '+' as in the acceleration pattern if that ever changes.
            [{'ENT_TYPE': 'distance'}, {'TEXT': {'REGEX': r'(^)?3|³'}}]
        )
        self.torque_matcher = Matcher(nlp.vocab)
        self.torque_matcher.add('UNIT_OF_TORQUE', None,
            [{'ORTH': 'Nm'}],
            [{'LOWER': 'newtonmeter'}]
        )
        # TODO: RPM MATCHER
        self.operator_matcher = Matcher(nlp.vocab)
        self.operator_matcher.add('OPERATOR', None,  # For now only < and >
            [{'ORTH': '<'}, {'LIKE_NUM': True}],
            [{'ORTH': '>'}, {'LIKE_NUM': True}],
            [{'ORTH': '<'}, {'ORTH': '='}, {'LIKE_NUM': True}],
            [{'ORTH': '>'}, {'ORTH': '='}, {'LIKE_NUM': True}],
            [{'ORTH': '+'}, {'ORTH': '/'}, {'LIKE_NUM': True}],  # LIKE_NUM already includes + and -
        )
        self.measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher.add('MEASUREMENT', None,
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'duration'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'memory'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'fraction'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'angle'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'distance'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'pressure'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'voltage'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'speed'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'acceleration'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'frequency'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'volume'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'torque'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'duration'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'memory'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'fraction'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'angle'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'distance'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'pressure'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'voltage'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'speed'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'acceleration'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'frequency'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'volume'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'torque'}],
            # TODO: 20 ... 30 UNIT, 20 to 30 UNIT, 20 of 60 UNIT
        )

    def __call__(self, doc):
        """Split composite tokens, then annotate units bottom-up; returns doc."""
        # Pass 1: split tokens containing a slash or comparison sign,
        # e.g. 'km/h' -> 'km','/','h'.
        with doc.retokenize() as retokenizer:
            matches1 = self.split_matcher1(doc)
            for match_id, start, end in matches1:
                span = Span(doc, start, end)
                if len(span.text) > 1:
                    # NOTE(review): when a token contains more than one of
                    # '/', '>', '<', only the last check wins — presumably
                    # such tokens never survive tokenization; verify.
                    if '/' in span.text:
                        split = re.split(r'(/)', span.text)
                    if '>' in span.text:
                        split = re.split(r'(>)', span.text)
                    if '<' in span.text:
                        split = re.split(r'(<)', span.text)
                    heads = [(doc[start], i) for i, _ in enumerate(split)]
                    retokenizer.split(doc[start], split, heads=heads)
        # Pass 2: split tokens containing ')', '²' or '³' into single chars.
        with doc.retokenize() as retokenizer:
            matches2 = self.split_matcher2(doc)
            for match_id, start, end in matches2:
                span = Span(doc, start, end)
                if len(span.text) > 1:
                    split = [x for x in span.text]
                    heads = [(doc[start], i) for i, _ in enumerate(split)]
                    retokenizer.split(doc[start], split, heads=heads)

        def annotate(matcher, unit_type: str, attribute):
            """Run *matcher* over doc, set token._.<attribute> on every matched
            token and add the span to doc.ents with label *unit_type*,
            resolving overlaps in favour of the larger span."""
            with doc.retokenize() as retokenizer:
                matches = matcher(doc)
                entities = list(doc.ents)
                add_flag = True
                for match_id, start, end in matches:
                    span = Span(doc, start, end, label=unit_type)
                    for token in span:
                        setattr(token._, attribute, True)
                    # Merging is deliberately disabled so per-token features
                    # and entity history survive; kept here for reference.
                    try:
                        if len(span) > 1:
                            # retokenizer.merge(span)
                            pass
                    except ValueError:
                        pass
                    for e in entities[:]:
                        r_e = range(e.start + 1, e.end + 1)
                        r_n = range(start + 1, end + 1)
                        # Remove smaller entities which would overlap with the new one.
                        if (end - start > e.end - e.start and (start + 1 in r_e or end in r_e)) or (start < e.start and end > e.end):
                            entities.remove(e)
                            continue
                        # Check if the entity to be added would overlap with an existing bigger one.
                        if (e.end - e.start > end - start and (e.start + 1 in r_n or e.end in r_n)) or (e.start < start and e.end > end):
                            add_flag = False
                    if add_flag:
                        entities.append(span)
                    add_flag = True  # reset for the next match
                doc.ents = entities

        # Order matters: basic units first, then the compounds built on them.
        annotate(self.duration_matcher, 'duration', 'is_duration_unit')
        annotate(self.memory_matcher, 'memory', 'is_memory_unit')
        annotate(self.fraction_matcher, 'fraction', 'is_fraction_unit')
        annotate(self.angle_matcher, 'angle', 'is_angle_unit')
        annotate(self.distance_matcher, 'distance', 'is_distance_unit')
        annotate(self.pressure_matcher, 'pressure', 'is_pressure_unit')
        annotate(self.voltage_matcher, 'voltage', 'is_voltage_unit')
        annotate(self.speed_matcher, 'speed', 'is_speed_unit')
        annotate(self.acceleration_matcher, 'acceleration', 'is_acceleration_unit')
        annotate(self.frequency_matcher, 'frequency', 'is_frequency_unit')
        annotate(self.volume_matcher, 'volume', 'is_volume_unit')
        annotate(self.torque_matcher, 'torque', 'is_torque_unit')
        annotate(self.operator_matcher, 'operator', 'is_operator')
        annotate(self.measurement_matcher, 'measurement', 'is_measurement')
        return doc
I have since found a rather obvious solution.
['<', '2.0', 'm', '/', 's', '²']
m SPEED / SPEED s SPEED
Those are three tokens of the entity type SPEED. Therefore it is enough to use the 'One or more' quantifier.
[{'ENT_TYPE': 'speed', 'OP': '+'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
In this solution, the entity types are still overwritten, but the underlying units are still stored as features on each token.