Search code examples
named-entity-recognition conll

Change Named Entity Recognition Format from ENAMEX to CoNLL


I have a dataset which is in ENAMEX format like this:

<ENAMEX TYPE="LOCATION">Italy</ENAMEX>'s business world was rocked by the announcement <TIMEX TYPE="DATE">last Thursday</TIMEX> that Mr. <ENAMEX TYPE="PERSON">Verdi</ENAMEX> would leave his job as vicepresident of <ENAMEX TYPE="ORGANIZATION">Music Masters of Milan, Inc</ENAMEX> to become operations director of <ENAMEX TYPE="ORGANIZATION">Arthur Andersen</ENAMEX>.

I want to change it into CoNLL format:

Italy  LOCATION
's  O
business O
world  O
was  O
rocked  O
by  O
the  O
announcement  O
last  DATE
Thursday  DATE
...
.  O

How can I do that? Is there a standard script for such format conversion?


Solution

  • I wrote one myself; it worked for me, though it has not been heavily tested:

    from __future__ import unicode_literals
    import os
    from os import path
    import re
    import os
    import re
    import en_core_web_sm #spacy
    
    # to convert formats such as <ENAMEX type="LOCATION">Italy</ENAMEX> is experiencing an economic boom.
    
    def xml_iter(file_):
        """Lazily yield every line of ``file_`` with surrounding whitespace removed.

        The file is opened in text mode when the first item is requested and
        closed once the generator is exhausted. Blank lines are yielded as
        empty strings rather than skipped.
        """
        with open(file_, 'r') as handle:
            for raw_line in handle:
                cleaned = raw_line.strip()
                yield cleaned
    
    
    
    # One annotated span: <ENAMEX|TIMEX|NUMEX TYPE="LABEL">text</same-tag>.
    # Case-insensitive because the markup is seen with both TYPE= and type=.
    _MARKUP_RE = re.compile(
        r'<(ENAMEX|TIMEX|NUMEX)\s+TYPE="(.+?)">(.+?)</\1>', re.IGNORECASE)


    def markupline2bio(line, tokenizer=None):
        """Convert one line of ENAMEX/TIMEX/NUMEX markup into BIO-tagged tokens.

        Only the text before the first tab character of ``line`` is processed.
        Text outside any markup element is tokenized and tagged ``'O'``. Text
        inside an element is tokenized and tagged ``'B-<TYPE>'`` for its first
        non-whitespace token and ``'I-<TYPE>'`` for the remaining ones, where
        ``<TYPE>`` is the value of the element's ``TYPE`` attribute, unchanged.
        Tokens that are empty after ``str(tok).strip()`` are dropped.

        Args:
            line: A single line of marked-up text.
            tokenizer: Optional callable taking a string and returning an
                iterable of tokens. When omitted, the module-level ``nlp``
                object is used, so it must be bound before the call.

        Returns:
            A ``(all_tokens, all_tags)`` pair of equally long lists: the token
            objects produced by the tokenizer and their BIO tag strings.
        """
        tokenize = nlp if tokenizer is None else tokenizer
        record = line.split('\t')[0]
        all_tokens = []
        all_tags = []

        def collect(text, label):
            # `first` follows kept tokens only, so a leading whitespace token
            # cannot steal the B- prefix from the first real token.
            first = True
            for tok in tokenize(text):
                if not str(tok).strip():
                    continue
                all_tokens.append(tok)
                if label is None:
                    all_tags.append('O')
                else:
                    all_tags.append(('B-' if first else 'I-') + label)
                    first = False

        prev_end = 0
        for match in _MARKUP_RE.finditer(record):
            # Plain text between the previous element and this one.
            collect(record[prev_end:match.start(0)], None)
            collect(match.group(3), match.group(2))
            prev_end = match.end(0)

        # Whatever follows the last element (or the whole record if none).
        collect(record[prev_end:], None)
        return all_tokens, all_tags
    
    if __name__ == '__main__':
        # The marked-up corpus and the BIO file derived from it sit side by side.
        data_dir = './data/indonesian_bert_all/Indonesian/ner/'
        source_path = os.path.join(data_dir, 'data_train_ugm.txt')
        output_file = os.path.join(data_dir, 'data_train_ugm.bio')

        # markupline2bio reads the module-level name `nlp`, so bind it before
        # the first line is converted.
        nlp = en_core_web_sm.load()

        with open(output_file, 'w') as fout:
            for line in xml_iter(source_path):
                all_tokens, all_tags = markupline2bio(line.strip())
                # One "token<TAB>tag" row per token, then a blank line to
                # separate this input line from the next.
                fout.writelines(
                    str(tok) + '\t' + tag + '\n'
                    for tok, tag in zip(all_tokens, all_tags)
                )
                fout.write('\n')