python text nltk named-entity-recognition

How to read a text and label each word of it in Python

# Sample input: (text, annotations). Each entity is a
# (start_offset, end_offset, category) triple indexing into the text.
data = (
    "Thousands of demonstrators have marched through London to protest "
    "the war in Iraq and demand the withdrawal of British troops from "
    "that country. Many people have been killed that day.",
    {
        "entities": [
            (48, 54, 'Category 1'),
            (77, 81, 'Category 1'),
            (111, 118, 'Category 2'),
            (150, 173, 'Category 3'),
        ],
    },
)

data[1]['entities'][0] = (48, 54, 'Category 1') stands for (start_offset, end_offset, entity).

I want to read each word of data[0] in a sequential manner and tag each word according to data[1] entities. I am expecting to have as final output,

{
'Thousands': 'O', 
'of': 'O',
'demonstrators': 'O',
'have': 'O',
'marched': 'O',
'through': 'O',
'London': 'S-1',
'to': 'O', 
'protest': 'O', 
'the': 'O', 
'war': 'O', 
'in': 'O', 
'Iraq': 'S-1',
'and': 'O',
'demand': 'O', 
'the': 'O', 
'withdrawal': 'O', 
'of': 'O', 
'British': 'S-2', 
'troops': 'O', 
'from': 'O',
'that': 'O', 
'country': 'O',
'.': 'O',
'Many': 'O', 
'people': 'S-3', 
'have': 'B-3', 
'been': 'B-3', 
'killed': 'E-3', 
'that': 'O', 
'day': 'O',
'.': 'O'
}

Here, 'O' stands for 'OutOfEntity', 'S' for 'Start', 'B' for 'Between', and 'E' for 'End'; the tags are unique for every given text.

I tried the following:

def ner(data, tokenize=None):
    """Tag every token of a text according to character-offset entity spans.

    Tokens are matched to entities by their position in the text, not by
    their spelling, so a word that also occurs outside an entity span (for
    example the first "have" in the sample sentence) is labelled 'O' rather
    than being dropped.

    Args:
        data: A ``(text, annotations)`` pair where ``annotations['entities']``
            is an iterable of ``(start_offset, end_offset, category)``
            triples. The category string must contain a number (for example
            ``'Category 3'``); that number becomes the tag suffix.
        tokenize: Optional callable mapping the text to a list of token
            strings. Defaults to ``nltk.word_tokenize``.

    Returns:
        A list of ``(token, tag)`` tuples in reading order. The tag is 'O'
        outside any entity; inside an entity it is 'S-<n>' for the first
        token, 'E-<n>' for the last token of a multi-token entity and
        'B-<n>' for the tokens in between.

    Raises:
        IndexError: If a category string contains no digits.
    """
    text = data[0]
    if tokenize is None:
        tokenize = nltk.word_tokenize
    tokens = tokenize(text)

    # Recover each token's character span by scanning the text left to right.
    # A token that cannot be found verbatim (a tokenizer may rewrite some
    # characters) gets no span and therefore stays 'O'.
    spans = []
    cursor = 0
    for token in tokens:
        start = text.find(token, cursor) if token else -1
        if start == -1:
            spans.append(None)
            continue
        cursor = start + len(token)
        spans.append((start, cursor))

    labels = ['O'] * len(tokens)
    for entity in data[1]['entities']:
        ent_start, ent_end = int(entity[0]), int(entity[1])
        category = re.findall('[0-9]+', entity[2])[0]
        # Every token whose span overlaps the half-open entity range.
        covered = [
            index for index, span in enumerate(spans)
            if span is not None and span[0] < ent_end and span[1] > ent_start
        ]
        last = len(covered) - 1
        for position, index in enumerate(covered):
            if position == 0:
                prefix = 'S'
            elif position == last:
                prefix = 'E'
            else:
                prefix = 'B'
            labels[index] = prefix + '-' + category

    return list(zip(tokens, labels))

But the above function does not work properly: if a word in the text is spelled the same as a word inside one of the data[1]['entities'] offsets, but is not itself part of any offset, it is ignored, whereas it should be labeled as 'O'.

Solution

I am not sure whether the final format is meant to be JSON, but below is an example that processes the data and prints it in the requested format, i.e.

# sample output
'''
{
'Thousands': 'O',
'of': 'O',
'demonstrators': 'O',
'have': 'O',
'marched': 'O',
'through': 'O',
'London': 'S-1',
'to': 'O',
'protest': 'O',
'the': 'O',
'war': 'O',
'in': 'O',
'Iraq': 'S-1',
'and': 'O',
'demand': 'O',
'the': 'O',
'withdrawal': 'O',
'of': 'O',
'British': 'S-2',
'troops': 'O',
'from': 'O',
'that': 'O',
'country.': 'O',
'Many': 'O',
'people': 'S-3',
'have': 'B-3',
'been': 'B-3',
'killed': 'E-3',
'that': 'O',
'day.': 'O'
}
'''
# sample code
data = ("Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country. Many people have been killed that day.",
        {"entities": [(48, 54, 'Category 1'), (77, 81, 'Category 1'), (111, 118, 'Category 2'), (150, 173, 'Category 3')]})


def tag_words(text, entities):
    """Return (word, tag) pairs for every whitespace-separated word of text.

    Args:
        text: The sentence(s) to label.
        entities: Iterable of (start_offset, end_offset, category) triples
            indexing into ``text``. Spans are assumed not to overlap; they
            are processed in order of their start offset.

    Returns:
        A list of (word, tag) tuples in reading order. Words outside every
        span are tagged "O". Inside a span the first word is tagged "S-<n>",
        the last word of a multi-word span "E-<n>" and the words in between
        "B-<n>", where <n> is the run of digits ending the category string
        (or its last character when it does not end in a digit).
    """
    pairs = []
    cursor = 0  # offset just past the previous entity
    for start, end, category in sorted(entities, key=lambda item: item[0]):
        # Plain text between the previous entity and this one.
        pairs.extend((word, "O") for word in text[cursor:start].split())

        # Keep the whole trailing number so 'Category 10' yields '10'.
        digits = category[len(category.rstrip("0123456789")):]
        suffix = digits or category[-1:]

        words = text[start:end].split()
        last = len(words) - 1
        for index, word in enumerate(words):
            if index == 0:
                prefix = "S"
            elif index == last:
                prefix = "E"
            else:
                prefix = "B"
            pairs.append((word, "%s-%s" % (prefix, suffix)))
        cursor = end

    # Whatever follows the last entity (the whole text when there is none).
    pairs.extend((word, "O") for word in text[cursor:].split())
    return pairs


print("{")
lines = ["'%s': '%s'" % pair for pair in tag_words(data[0], data[1]["entities"])]
if lines:
    # Join with commas so that only the final entry lacks a trailing comma.
    print(",\n".join(lines))
print("}")