I am building a unit annotater based on the idea of MedaCy. First all basic unit types are set, which are then used to build more complex units. For example:
m/s² → m DISTANCE / s DURATION ² → m/s SPEED ² → m/s² ACCELERATION
For this purpose I changed part of the tokenization so that numbers are always separated from alphabetic characters and similar symbols.
'< 2.0 m/s²' → ['<', '2.0', 'm', '/', 's', '²']
However, my current issue is that I can only achieve the last step (ACCELERATION) by merging the tokens whenever an entity is recognized. This results in the loss of feature and entity information of the underlying tokens, which I definitely want to avoid. Therefore I disabled the merging of tokens, but now I cannot achieve the last step of annotating entities like acceleration. This is because the matcher works token-based. As seen below, the matcher is not able to detect the entity since it spans over multiple tokens. (Note that speed is annotated correctly over multiple tokens.)
[{'ENT_TYPE': 'speed'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
['<', '2.0', 'm', '/', 's', '²']
Adding all the possible token combinations to the acceleration matcher would not be a viable solution for me, since it would interfere with the concept of building all units from the bottom up.
Another solution which has come to my mind is using multiple entity rulers: first the basic units would have to be tagged, and then subsequently the more complex ones. However, it seems to run into the same tokenization issue, and furthermore I get the error message that there can only be one entity ruler: 'entity_ruler' already exists in pipeline. Existing names: ['entity_ruler']
In summary I want to annotate entities using entities spanning over multiple tokens. Hence token based matching does not work.
This is called right after creating a blank spaCy model.
def remove_units(nlp):
    """Customize the tokenizer of *nlp* so that numbers, units, slashes and
    superscripts become separate tokens, e.g. '2.0m/s²' -> '2.0','m','/','s','²'.

    Mutates ``nlp.tokenizer`` in place (suffix and prefix rules); returns None.
    Call this right after creating the blank spaCy model.
    """
    suffixes = list(nlp.Defaults.suffixes)
    # spaCy's default suffix rule that keeps '2.0km' etc. as a single token.
    # NOTE(review): this literal must match the installed spaCy version's
    # default byte-for-byte; the removal is guarded so a version mismatch
    # does not raise ValueError.
    UNITS = '(?<=[0-9])(?:km|km²|km³|m|m²|m³|dm|dm²|dm³|cm|cm²|cm³|mm|mm²|mm³|ha|µm|nm|yd|in|ft|kg|g|mg|µg|t|lb|oz|m/s|km/h|kmh|mph|hPa|Pa|mbar|mb|MB|kb|KB|gb|GB|tb|TB|T|G|M|K|%|км|км²|км³|м|м²|м³|дм|дм²|дм³|см|см²|см³|мм|мм²|мм³|нм|кг|г|мг|м/с|км/ч|кПа|Па|мбар|Кб|КБ|кб|Мб|МБ|мб|Гб|ГБ|гб|Тб|ТБ|тбكم|كم²|كم³|م|م²|م³|سم|سم²|سم³|مم|مم²|مم³|كم|غرام|جرام|جم|كغ|ملغ|كوب|اكواب)'
    if UNITS in suffixes:
        suffixes.remove(UNITS)
    # BUG FIX: the original rebuilt the suffix tuple from nlp.Defaults.suffixes
    # here, silently discarding the removal above. Build on the modified list.
    suffixes = tuple(suffixes) + (
        # Split numbers from letters. '[A-Za-z]' replaces the original '[A-z]',
        # which accidentally also matched '[', '\', ']', '^', '_' and '`'.
        r'(?<=[0-9])(?:[A-Za-z]+[^.,:;]*)',
        r'/',  # split trailing slashes: 'm/' -> 'm','/'
    )
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search
    # Split leading slashes from other characters: '/s' -> '/','s'
    prefixes = tuple(nlp.Defaults.prefixes) + (r'/',)
    prefix_regex = spacy.util.compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Token
import re
class UnitAnnotator(object):
    """spaCy pipeline component that annotates measurement units bottom-up.

    Basic unit classes (duration, distance, ...) are matched first; compound
    units are then matched on top of the entity labels the earlier passes
    produced (speed = distance '/' duration, acceleration = speed '²', ...).
    Tokens are never merged, so every token keeps its unit-class flags in the
    ``token._.is_*`` extensions even after a larger entity span overwrites
    its entity label in ``doc.ents``.
    """

    name = "unit_annotator"
    dependencies = []

    def __init__(self, nlp):
        self.nlp = nlp
        # Per-token flags recording the unit class(es) a token belongs to.
        # These survive entity overwriting by larger compound-unit spans.
        Token.set_extension('is_duration_unit', default=False, force=True)
        Token.set_extension('is_memory_unit', default=False, force=True)
        Token.set_extension('is_fraction_unit', default=False, force=True)
        Token.set_extension('is_angle_unit', default=False, force=True)
        Token.set_extension('is_distance_unit', default=False, force=True)
        Token.set_extension('is_pressure_unit', default=False, force=True)
        Token.set_extension('is_voltage_unit', default=False, force=True)
        Token.set_extension('is_speed_unit', default=False, force=True)
        Token.set_extension('is_acceleration_unit', default=False, force=True)
        Token.set_extension('is_frequency_unit', default=False, force=True)
        Token.set_extension('is_volume_unit', default=False, force=True)
        Token.set_extension('is_torque_unit', default=False, force=True)
        Token.set_extension('is_operator', default=False, force=True)
        Token.set_extension('is_measurement', default=False, force=True)
        # Matchers that locate tokens still containing several symbols, so
        # __call__ can split them before any unit matching happens.
        self.split_matcher1 = Matcher(nlp.vocab)
        self.split_matcher1.add('SPLIT1', None,
            [{'TEXT': {'REGEX': r'[/\>\<]'}}],  # 'km/h' -> 'km','/','h'
        )
        self.split_matcher2 = Matcher(nlp.vocab)
        self.split_matcher2.add('SPLIT2', None,
            [{'TEXT': {'REGEX': r'[²³]'}}],  # 'm²' -> 'm','²'
            [{'TEXT': {'REGEX': r'\)'}}],    # '8)' -> '8',')'
            # TODO: FIX splitting of '(mis)interventions'
        )
        self.duration_matcher = Matcher(nlp.vocab)
        # Key renamed from 'UNIT_OF_duration' for consistency with the
        # all-uppercase keys of every other matcher (the id is never read back).
        self.duration_matcher.add('UNIT_OF_DURATION', None,
            [{'ORTH': 'ms'}],
            [{'LOWER': 'msec'}],
            [{'LOWER': 'milisecond'}],
            [{'LOWER': 'miliseconds'}],
            [{'ORTH': 's'}],
            [{'LOWER': 'sec'}],
            [{'LOWER': 'second'}],
            [{'LOWER': 'seconds'}],
            [{'LOWER': 'min'}],
            [{'LOWER': 'mins'}],
            [{'LOWER': 'minute'}],
            [{'LOWER': 'minutes'}],
            [{'ORTH': 'h'}],
            [{'LOWER': 'hour'}],
            [{'LOWER': 'hours'}]
        )
        self.memory_matcher = Matcher(nlp.vocab)
        self.memory_matcher.add('UNIT_OF_MEMORY', None,
            [{'LOWER': 'kb'}],
            [{'LOWER': 'kbs'}],
            [{'LOWER': 'kbit'}],
            [{'LOWER': 'kbits'}],
            [{'LOWER': 'mb'}],
            [{'LOWER': 'mbs'}],
            [{'LOWER': 'mbit'}],
            [{'LOWER': 'mbits'}],
            [{'LOWER': 'gb'}],
            [{'LOWER': 'gbs'}],
            [{'LOWER': 'gbit'}],
            [{'LOWER': 'gbits'}],
            [{'LOWER': 'tb'}],
            [{'LOWER': 'tbs'}],
            [{'LOWER': 'bit'}],
            [{'LOWER': 'bits'}],
            [{'LOWER': 'byte'}],
            [{'LOWER': 'bytes'}],
            [{'LOWER': 'kilobyte'}],
            [{'LOWER': 'kilobytes'}],
            [{'LOWER': 'megabyte'}],
            [{'LOWER': 'megabytes'}],
            [{'LOWER': 'gigabyte'}],
            [{'LOWER': 'gigabytes'}],
            [{'LOWER': 'terrabyte'}],
            [{'LOWER': 'terrabytes'}],
        )
        self.fraction_matcher = Matcher(nlp.vocab)
        self.fraction_matcher.add('UNIT_OF_FRACTION', None,
            [{'ORTH': '%'}],
            [{'LOWER': 'percent'}],
            [{'LOWER': 'per'}, {'LOWER': 'cent'}]
        )
        self.angle_matcher = Matcher(nlp.vocab)
        self.angle_matcher.add('UNIT_OF_ANGLE', None,
            [{'LOWER': '°'}],
            [{'LOWER': '°c'}],
            [{'LOWER': 'deg'}],
            [{'LOWER': 'degs'}],
            [{'LOWER': 'degree'}],
            [{'LOWER': 'degrees'}],
        )
        self.distance_matcher = Matcher(nlp.vocab)
        self.distance_matcher.add('UNIT_OF_DISTANCE', None,
            [{'ORTH': 'nm'}],
            [{'LOWER': 'nanometer'}],
            [{'LOWER': 'nanometers'}],
            [{'ORTH': 'µm'}],
            [{'LOWER': 'micrometer'}],
            [{'LOWER': 'mircometers'}],
            [{'ORTH': 'mm'}],
            [{'LOWER': 'milimeter'}],
            [{'LOWER': 'milimeters'}],
            [{'ORTH': 'cm'}],
            [{'LOWER': 'cendurationter'}],
            [{'LOWER': 'cendurationters'}],
            [{'ORTH': 'm'}],
            [{'LOWER': 'meter'}],
            [{'LOWER': 'meters'}],
            [{'ORTH': 'km'}],
            [{'LOWER': 'kilometer'}],
            [{'LOWER': 'kilometers'}],
            [{'LOWER': 'zoll'}],
        )
        self.pressure_matcher = Matcher(nlp.vocab)
        self.pressure_matcher.add('UNIT_OF_PRESSURE', None,
            [{'LOWER': 'bar'}]  # Maybe add F/A
        )
        self.voltage_matcher = Matcher(nlp.vocab)
        self.voltage_matcher.add('UNIT_OF_VOLTAGE', None,
            [{'ORTH': 'V'}],
            # BUG FIX: pattern key must be uppercase — the lowercase 'lower'
            # key was not a valid Matcher attribute and never matched 'volt'.
            [{'LOWER': 'volt'}],
        )
        # Compound units below build on the entity labels of the basic passes.
        self.speed_matcher = Matcher(nlp.vocab)
        self.speed_matcher.add('UNIT_OF_SPEED', None,
            [{'ENT_TYPE': 'distance'}, {'LOWER': {'REGEX': r'/|p'}}, {'ENT_TYPE': 'duration'}]
        )
        self.acceleration_matcher = Matcher(nlp.vocab)
        self.acceleration_matcher.add('UNIT_OF_ACCELERATION', None,
            # BUG FIX: a speed entity spans several tokens ('m','/','s'), each
            # labelled 'speed'; the '+' quantifier lets the single-token
            # ENT_TYPE condition cover the whole entity.
            [{'ENT_TYPE': 'speed', 'OP': '+'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
        )
        self.frequency_matcher = Matcher(nlp.vocab)
        self.frequency_matcher.add('UNIT_OF_FREQUENCY', None,
            [{'LOWER': 'hz'}],
            [{'LOWER': 'herz'}],  # common misspelling
            [{'LOWER': 'hertz'}],
            [{'LOWER': '1'}, {'ORTH': '/'}, {'ENT_TYPE': 'duration'}]
        )
        self.volume_matcher = Matcher(nlp.vocab)
        self.volume_matcher.add('UNIT_OF_VOLUME', None,
            [{'LOWER': 'l'}],
            [{'LOWER': 'liter'}],
            # NOTE(review): distance entities are single tokens here; add
            # 'OP': '+' as in the acceleration pattern if that ever changes.
            [{'ENT_TYPE': 'distance'}, {'TEXT': {'REGEX': r'(^)?3|³'}}]
        )
        self.torque_matcher = Matcher(nlp.vocab)
        self.torque_matcher.add('UNIT_OF_TORQUE', None,
            [{'ORTH': 'Nm'}],
            [{'LOWER': 'newtonmeter'}]
        )
        # TODO: RPM MATCHER
        self.operator_matcher = Matcher(nlp.vocab)
        self.operator_matcher.add('OPERATOR', None,  # For now only < and >
            [{'ORTH': '<'}, {'LIKE_NUM': True}],
            [{'ORTH': '>'}, {'LIKE_NUM': True}],
            [{'ORTH': '<'}, {'ORTH': '='}, {'LIKE_NUM': True}],
            [{'ORTH': '>'}, {'ORTH': '='}, {'LIKE_NUM': True}],
            [{'ORTH': '+'}, {'ORTH': '/'}, {'LIKE_NUM': True}],  # LIKE_NUM already includes + and -
        )
        self.measurement_matcher = Matcher(nlp.vocab)
        self.measurement_matcher.add('MEASUREMENT', None,
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'duration'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'memory'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'fraction'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'angle'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'distance'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'pressure'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'voltage'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'speed'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'acceleration'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'frequency'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'volume'}],
            [{'LIKE_NUM': True}, {'ENT_TYPE': 'torque'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'duration'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'memory'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'fraction'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'angle'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'distance'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'pressure'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'voltage'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'speed'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'acceleration'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'frequency'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'volume'}],
            [{'ENT_TYPE': 'operator'}, {'ENT_TYPE': 'torque'}],
            # TODO: 20 ... 30 UNIT, 20 to 30 UNIT, 20 of 60 UNIT
        )

    def __call__(self, doc):
        """Split composite tokens, then annotate units bottom-up; returns doc."""
        # Pass 1: split tokens containing a slash or comparison sign,
        # e.g. 'km/h' -> 'km','/','h'.
        with doc.retokenize() as retokenizer:
            matches1 = self.split_matcher1(doc)
            for match_id, start, end in matches1:
                span = Span(doc, start, end)
                if len(span.text) > 1:
                    # NOTE(review): when a token contains more than one of
                    # '/', '>', '<', only the last check wins — presumably
                    # such tokens never survive tokenization; verify.
                    if '/' in span.text:
                        split = re.split(r'(/)', span.text)
                    if '>' in span.text:
                        split = re.split(r'(>)', span.text)
                    if '<' in span.text:
                        split = re.split(r'(<)', span.text)
                    heads = [(doc[start], i) for i, _ in enumerate(split)]
                    retokenizer.split(doc[start], split, heads=heads)
        # Pass 2: split tokens containing ')', '²' or '³' into single chars.
        with doc.retokenize() as retokenizer:
            matches2 = self.split_matcher2(doc)
            for match_id, start, end in matches2:
                span = Span(doc, start, end)
                if len(span.text) > 1:
                    split = [x for x in span.text]
                    heads = [(doc[start], i) for i, _ in enumerate(split)]
                    retokenizer.split(doc[start], split, heads=heads)

        def annotate(matcher, unit_type: str, attribute):
            """Run *matcher* over doc, set token._.<attribute> on every matched
            token and add the span to doc.ents with label *unit_type*,
            resolving overlaps in favour of the larger span."""
            with doc.retokenize() as retokenizer:
                matches = matcher(doc)
                entities = list(doc.ents)
                add_flag = True
                for match_id, start, end in matches:
                    span = Span(doc, start, end, label=unit_type)
                    for token in span:
                        setattr(token._, attribute, True)
                    # Merging is deliberately disabled so per-token features
                    # and entity history survive; kept here for reference.
                    try:
                        if len(span) > 1:
                            # retokenizer.merge(span)
                            pass
                    except ValueError:
                        pass
                    for e in entities[:]:
                        r_e = range(e.start + 1, e.end + 1)
                        r_n = range(start + 1, end + 1)
                        # Remove smaller entities which would overlap with the new one.
                        if (end - start > e.end - e.start and (start + 1 in r_e or end in r_e)) or (start < e.start and end > e.end):
                            entities.remove(e)
                            continue
                        # Check if the entity to be added would overlap with an existing bigger one.
                        if (e.end - e.start > end - start and (e.start + 1 in r_n or e.end in r_n)) or (e.start < start and e.end > end):
                            add_flag = False
                    if add_flag:
                        entities.append(span)
                    add_flag = True  # reset for the next match
                doc.ents = entities

        # Order matters: basic units first, then the compounds built on them.
        annotate(self.duration_matcher, 'duration', 'is_duration_unit')
        annotate(self.memory_matcher, 'memory', 'is_memory_unit')
        annotate(self.fraction_matcher, 'fraction', 'is_fraction_unit')
        annotate(self.angle_matcher, 'angle', 'is_angle_unit')
        annotate(self.distance_matcher, 'distance', 'is_distance_unit')
        annotate(self.pressure_matcher, 'pressure', 'is_pressure_unit')
        annotate(self.voltage_matcher, 'voltage', 'is_voltage_unit')
        annotate(self.speed_matcher, 'speed', 'is_speed_unit')
        annotate(self.acceleration_matcher, 'acceleration', 'is_acceleration_unit')
        annotate(self.frequency_matcher, 'frequency', 'is_frequency_unit')
        annotate(self.volume_matcher, 'volume', 'is_volume_unit')
        annotate(self.torque_matcher, 'torque', 'is_torque_unit')
        annotate(self.operator_matcher, 'operator', 'is_operator')
        annotate(self.measurement_matcher, 'measurement', 'is_measurement')
        return doc
I have since found a rather obvious solution.
['<', '2.0', 'm', '/', 's', '²']
m SPEED / SPEED s SPEED
Those are three tokens of the entity type SPEED. Therefore it is enough to use the 'One or more' quantifier.
[{'ENT_TYPE': 'speed', 'OP': '+'}, {'TEXT': {'REGEX': r'(^)?2|²'}}]
In this solution, the entity types are still overwritten, but the underlying units are still stored as features on each token.