My program needs to read a file of sentences and produce output like this:
input: Ixé Maria. output: Ixé\PRON Maria\N-PR.
So far I have written this, but the output file ends up empty (please give me suggestions):
# Attempt at a dictionary-based tagger: read corpus_test.txt, append a tag to
# each known word, and write the tagged lines to tag_test.txt.
# The whole corpus is read into one string; this file object is never closed explicitly.
infile = open('corpus_test.txt', 'r', encoding='utf-8').read()
outfile = open('tag_test.txt', 'w', encoding='utf-8')
# Maps a word form to its tag.
dicionario = {'mimbira': 'N',
'anama-itá': 'N-PL',
'Maria': 'N-PR',
'sumuara-kunhã': 'N-FEM',
'sumuara-kunhã-itá': 'N-FEM-PL',
'sapukaia-apigaua': 'N-MASC',
'sapukaia-apigaua-itá': 'N-MASC-PL',
'nhaã': 'DEM',
'nhaã-itá': 'DEM-PL',
'ne': 'POS',
'mukuĩ': 'NUM',
'muíri': 'QUANT',
'iepé': 'INDF',
'pirasua': 'A1',
'pusé': 'A2',
'ixé': 'PRON1',
'se': 'PRON2',
# NOTE(review): this is a single key, the three-character string '. ;', so a
# lone '.' or ';' token would not match it -- presumably two entries were meant.
'. ;': 'PUNCT'
}
# These are views over ALL keys and ALL values, not one word and its tag.
np_words = dicionario.keys()
np_tags = dicionario.values()
# NOTE(review): indentation was lost in this paste; presumably the lines down
# to the print() form the loop body and the `if` body -- confirm.
for line in infile.splitlines():
list_of_words = line.split()
# This asks whether the keys view itself is an element of the word list. A list
# of strings never contains it, so the test is always False; if the print() is
# inside this `if`, nothing is ever written, which matches the empty output file.
if np_words in list_of_words:
# index() would likewise search for the view object, not for an individual word.
tag_word = list_of_words.index(np_words)+1
# list.insert() modifies the list in place and returns None, so word_tagged
# would be None here. The f-string would embed the whole values view, and
# '\{' is not a recognised escape sequence.
word_tagged = list_of_words.insert(tag_word, f'\{np_tags}')
# " ".join(None) would raise TypeError if this line were ever reached.
word_tagged = " ".join(word_tagged)
print(word_tagged, file=outfile)
outfile.close()
Starting simply with NLP makes it easier to understand and also to appreciate the more advanced systems.
Here is a simpler approach that tags each word by looking it up in the dictionary:
# Tag every word of corpus_test.txt that appears in the dictionary and write
# the tagged lines to tag_test.txt, e.g. "Ixé Maria." -> "Ixé\PRON1 Maria\N-PR."

# Word form -> tag. Extend this with the rest of the vocabulary.
dicionario = {
    'Maria': 'N-PR',
    'ixé': 'PRON1',
}

# Punctuation that can be left glued to the end of a word by str.split().
TRAILING_PUNCTUATION = '.,;:!?'


def tag_word(token, tags):
    """Return *token* with a backslash and its tag appended, if it has one.

    Trailing punctuation is set aside before the lookup and re-attached after
    the tag, so 'Maria.' becomes 'Maria\\N-PR.'. The lookup tries the exact
    spelling first and then the lower-cased form, so a sentence-initial 'Ixé'
    still matches the key 'ixé'. Tokens with no entry in *tags* are returned
    unchanged.
    """
    core = token.rstrip(TRAILING_PUNCTUATION)
    trailing = token[len(core):]
    tag = tags.get(core)
    if tag is None:
        # Dictionary keys are case-sensitive; fall back to the lower-cased word.
        tag = tags.get(core.lower())
    if tag is None:
        return token
    return core + '\\' + tag + trailing


def tag_line(line, tags):
    """Return *line* with every known word tagged.

    The line is split on whitespace and re-joined with single spaces, so
    leading/trailing whitespace (including the newline) is dropped and runs of
    spaces collapse. A blank line yields an empty string.
    """
    return ' '.join(tag_word(token, tags) for token in line.split())


def main():
    """Read corpus_test.txt, tag each line, and write tag_test.txt."""
    # 'with' closes each file automatically when its block ends.
    with open('corpus_test.txt', 'r', encoding='utf-8') as f:
        # Iterating a file yields one line at a time.
        outlines = [tag_line(line, dicionario) + '\n' for line in f]
    with open('tag_test.txt', 'w', encoding='utf-8') as f:
        f.writelines(outlines)


if __name__ == '__main__':
    main()