The resulting files are two very long one-element list where all the processed text put together. I tried to move the list.append command under the if else statement and I got a very huge list where every few words are lumped together followed by the same previous words with some new words added to them until I get a full sentence that I am after, then it starts to do the same with the next match. I am sure it can be solved with a better loop. I also tried to work with the resulting files but it's quite inefficient as I no longer have any basis to split them. is it possible that this is a result of the "or" operand in the written regular expression ?
import csv
import re
import string
import nltk
from nltk.tokenize import punkt, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import langid
h=SnowballStemmer("hungarian") # hungarian stemmer
stop_words=set(stopwords.words("hungarian")) # - {"Nem,nem"}
i=0.0
j=0.0
latin_counter=0.0
result=[]
result2=[]
tokenized_txt=[]
tokenized_txt_latin=[]
unstemmed_list=[]
auxlist=[]
stop_words_latin={'ab', 'ac', 'ad', 'adhic', 'aliqui', 'aliquis', 'an', 'ante', 'apud', 'at', 'atque',
'aut', 'autem', 'cum', 'cur', 'de', 'deinde', 'dum', 'ego', 'enim', 'ergo', 'es', 'est', 'et', 'etiam', 'etsi', 'ex', 'fio', 'haud',
'hic', 'iam', 'idem', 'igitur', 'ille', 'in', 'infra', 'inter', 'interim', 'ipse', 'is', 'ita', 'magis', 'modo',
'mox', 'nam', 'ne', 'nec', 'necque', 'neque', 'nisi', 'non', 'nos', 'o', 'ob', 'per', 'possum', 'post', 'pro', 'quae', 'quam', 'quare', 'qui',
'quia', 'quicumque', 'quidem', 'quilibet', 'quis', 'quisnam', 'quisquam', 'quisque', 'quisquis', 'quo', 'quoniam', 'sed', 'si', 'sic',
'sive', 'sub', 'sui', 'sum', 'super', 'suus', 'tam', 'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero'}
with open('data/onkology.csv', 'r') as csv_file:
csv_reader= csv.reader(csv_file ,delimiter=';')
exp=(r'[l L]u.r[o i]n\b|(\w)*peptyl\b|(\w)*lutamid\b')
for line in csv_reader:
i+= 1
for lineElem in line:
if (re.search(exp,lineElem) and len(lineElem)>80) :
result2.append(lineElem) # if we want to see what we matched
tst_txt=lineElem
j+=1
#if(i >= 10000):
# break
for listElem in result2:
k,_ =langid.classify(listElem)
if(k=='la'):
#print (tst_txt)
latin_counter+=1
words=word_tokenize(listElem)
# removing stop words
for w in words:
if w not in stop_words_latin:
#Stemming and add to a list
tokenized_txt_latin.append(w)
# removing punctuation
tokenized_txt_latin = [word for word in tokenized_txt_latin if word.isalpha()]
words=' '.join(tokenized_txt_latin) # rejoining tokens to form a string
else :
words=word_tokenize(listElem)
# removing stop words
for w in words:
if w not in stop_words:
#Stemming and add to a list
auxlist.append(w)
tokenized_txt.append(h.stem(w))
#unstemmed_list.append(words)
# removing punctuation
auxlist = [word for word in auxlist if word.isalpha()]
words2=' '.join(auxlist) # rejoining tokens to form a string
tokenized_txt = [word for word in tokenized_txt if word.isalpha()]
words=' '.join(tokenized_txt) # rejoining tokens to form a string
result.append(words)
unstemmed_list.append(words2)
print("Matching rate is :", (j/i) )
print(unstemmed_list ,"\n")
print(result,"\n")
# write results to a file
with open('listfile.txt', 'w') as filehandle:
for listitem in result:
filehandle.write('%s\n' % listitem)
with open('listfile_unstemmed.txt', 'w') as filehandle:
for listitem in unstemmed_list:
filehandle.write('%s\n' % listitem)
After running the code on a different machine (I migrated my project to google Colab) and comparing results I found out that this was a result of the memory overflow on the old machine and had nothing to do with the code itself.