Tags: python, regex, loops, csv, nlp

How to split a regex-matched result list by newline after stemming and removing punctuation?


The resulting files are two very long one-element lists with all of the processed text joined together. I tried moving the list.append command under the if/else statement, and I got a huge list where every few words are lumped together, followed by the same previous words with a few new ones added, until I finally get the full sentence I am after; then the same thing starts again for the next match. I am sure this can be solved with a better loop. I also tried working with the resulting files directly, but that is quite inefficient since I no longer have any basis to split them. Is it possible that this is caused by the "or" operator in the regular expression? (A quick check of the pattern is shown after the full script below.)
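
For comparison, here is a minimal sketch (with made-up input) of the loop shape that would yield one cleaned string per matched row: the token list is rebuilt on every iteration, and the append happens inside the loop rather than after it. The stand-in steps below are hypothetical placeholders for the tokenizing and filtering in the real script.

# minimal sketch with made-up input: one cleaned string per matched row
matches = ["first matched sentence .", "second matched sentence !"]

results = []
for m in matches:
    tokens = []                       # reset per item instead of accumulating globally
    for w in m.split():               # stand-in for word_tokenize(m)
        if w.isalpha():               # stand-in for stop-word/punctuation filtering
            tokens.append(w)
    results.append(' '.join(tokens))  # append inside the loop: one entry per match

print(results)  # ['first matched sentence', 'second matched sentence']

The full script follows.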

import csv
import re

import langid
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

h = SnowballStemmer("hungarian")                # Hungarian stemmer
stop_words = set(stopwords.words("hungarian"))  # - {"Nem,nem"}
i = 0.0                   # total number of rows read
j = 0.0                   # number of cells that matched the pattern
latin_counter = 0.0       # number of matched cells classified as Latin
result = []               # stemmed output
result2 = []              # raw matched cells
tokenized_txt = []        # stemmed Hungarian tokens
tokenized_txt_latin = []  # Latin tokens (no stemming)
unstemmed_list = []       # unstemmed output
auxlist = []              # unstemmed Hungarian tokens

stop_words_latin={'ab', 'ac', 'ad', 'adhic', 'aliqui', 'aliquis', 'an', 'ante', 'apud', 'at', 'atque',
 'aut', 'autem', 'cum', 'cur', 'de', 'deinde', 'dum', 'ego', 'enim', 'ergo', 'es', 'est', 'et', 'etiam', 'etsi', 'ex', 'fio', 'haud', 
 'hic', 'iam', 'idem', 'igitur', 'ille', 'in', 'infra', 'inter', 'interim', 'ipse', 'is', 'ita', 'magis', 'modo',
 'mox', 'nam', 'ne', 'nec', 'necque', 'neque', 'nisi', 'non', 'nos', 'o', 'ob', 'per', 'possum', 'post', 'pro', 'quae', 'quam', 'quare', 'qui',
 'quia', 'quicumque', 'quidem', 'quilibet', 'quis', 'quisnam', 'quisquam', 'quisque', 'quisquis', 'quo', 'quoniam', 'sed', 'si', 'sic',
 'sive', 'sub', 'sui', 'sum', 'super', 'suus', 'tam', 'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero'}

with open('data/onkology.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=';')

    # match word endings such as "...u.r[oi]n", "...peptyl" or "...lutamid"
    exp = r'[l L]u.r[o i]n\b|(\w)*peptyl\b|(\w)*lutamid\b'
    for line in csv_reader:
        i += 1
        for lineElem in line:
            if re.search(exp, lineElem) and len(lineElem) > 80:
                result2.append(lineElem)  # keep the matched cell if we want to see what we matched
                tst_txt = lineElem
                j += 1
        # if i >= 10000:
        #     break


                
for listElem in result2:
    k, _ = langid.classify(listElem)  # identify the language of the matched cell

    if k == 'la':
        # print(tst_txt)
        latin_counter += 1
        words = word_tokenize(listElem)
        # removing stop words
        for w in words:
            if w not in stop_words_latin:
                tokenized_txt_latin.append(w)
        # removing punctuation
        tokenized_txt_latin = [word for word in tokenized_txt_latin if word.isalpha()]
        words = ' '.join(tokenized_txt_latin)  # rejoining tokens to form a string

    else:
        words = word_tokenize(listElem)
        # removing stop words
        for w in words:
            if w not in stop_words:
                # stemming and adding to a list
                auxlist.append(w)
                tokenized_txt.append(h.stem(w))
                # unstemmed_list.append(words)
        # removing punctuation
        auxlist = [word for word in auxlist if word.isalpha()]
        words2 = ' '.join(auxlist)  # rejoining tokens to form a string

        tokenized_txt = [word for word in tokenized_txt if word.isalpha()]
        words = ' '.join(tokenized_txt)  # rejoining tokens to form a string

# note: these appends run once, after the loop, so each list ends up
# holding a single string containing all of the processed text
result.append(words)
unstemmed_list.append(words2)


print("Matching rate is :",  (j/i) )

print(unstemmed_list ,"\n")
print(result,"\n")

# write results to files
with open('listfile.txt', 'w') as filehandle:
    for listitem in result:
        filehandle.write('%s\n' % listitem)

with open('listfile_unstemmed.txt', 'w') as filehandle:
    for listitem in unstemmed_list:
        filehandle.write('%s\n' % listitem)
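
As for the closing question: alternation ("|") in a pattern only changes which substrings can match within a single cell; re.search still tests one string at a time and cannot merge rows together. A quick check with the pattern from the question (the sample tokens below are made up):

import re

exp = r'[l L]u.r[o i]n\b|(\w)*peptyl\b|(\w)*lutamid\b'

# each string is tested independently; alternation cannot join rows
for s in ['lupron', 'bicalutamid', 'no match here']:
    print(s, '->', bool(re.search(exp, s)))
# lupron -> True, bicalutamid -> True, no match here -> False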


Solution

  • After running the code on a different machine (I migrated my project to Google Colab) and comparing the results, I found that the problem was caused by a memory overflow on the old machine and had nothing to do with the code itself.
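
For anyone repeating the comparison, a small sanity check (run in the same session, since it reuses result2 from the script) can confirm whether the written output lines up with the number of matched cells:

# sanity check: compare the lines actually written with the matched cells
with open('listfile.txt') as f:
    written = sum(1 for _ in f)
print(written, 'line(s) written for', len(result2), 'matched cell(s)')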