Search code examples
pythonfilefor-loopnlptext-files

How to perform same operation on multiple text files and save the output in different files using python?


I have written a code which extracts stop words from a text file and outputs two new text files. One file contains the stop words from that text file and another file contains the data without stop words. Now I have more than 100 text file in a folder, I would like to perform the same operation on all those file simultaneously.

For example there is a Folder A which contains 100 text file the code should be executed on all those text files simultaneously. The output should be two new text files such as 'Stop_Word_Consist_Filename.txt' and 'Stop_word_not_Filename.txt' which should be stored in a separate folder.That means for every 100 text files there will 200 output text files stored in a new folder. Please note the 'Filename' in both these output file is the actual name of the text file meaning 'Walmart.txt' should have 'Stop_Word_Consist_Walmart.txt' and 'Stop_word_not_Walmart.txt'. I did try few things and I know loop in involved giving the path directory but I didn't get any success.

Apologies for such a long question.

Following is the code for 1 file.

import numpy as np
import pandas as pd

# Pathes of source files and that for after-modifications
files_path = os.getcwd()
# another folder, your should create first to store files after modifications in
files_after_path = os.getcwd() + '/' + 'Stopwords_folder'
os.makedirs(files_after_path, exist_ok=True)
text_files = os.listdir(files_path)
data = pd.DataFrame(text_files)
data.columns = ["Review_text"]

import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def clean_text(df):
    all_reviews = list()
    #lines = df["Review_text"].values.tolist()
    lines = data.values.tolist()

    for text in lines:
        #text = text.lower()
        text = [word.lower() for word in text]

        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', str(text))
        
        emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        text = emoji.sub(r'', text)
        
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"that's", "that is", text)        
        text = re.sub(r"what's", "what is", text)
        text = re.sub(r"where's", "where is", text) 
        text = re.sub(r"\'ll", " will", text)  
        text = re.sub(r"\'ve", " have", text)  
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"don't", "do not", text)
        text = re.sub(r"did't", "did not", text)
        text = re.sub(r"can't", "can not", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"couldn't", "could not", text)
        text = re.sub(r"have't", "have not", text)
        
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        stop_words = set(stopwords.words("english"))
        stop_words.discard("not")
        PS = PorterStemmer()
        words = [PS.stem(w) for w in words if not w in stop_words]
        words = ' '.join(words)
        all_reviews.append(words)
    return all_reviews,stop_words

for entry in data:
    #all_reviews , stop_words = clean_text(entry)
    for r in all_reviews: 
        if not r in stop_words: 
            appendFile = open(f'No_Stopwords{entry}.txt','a') 
            appendFile.write(" "+r) 
            appendFile.close() 
    
    for r in stop_words: 
        appendFile = open(f'Stop_Word_Consist{entry}.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
    all_reviews , stop_words = clean_text(entry)

UPDATE :

So I have made changes to the code. I did got two output files Stop_Word_Consist and No_Stop_word. But I am not getting the required Data inside. Meaning Stop_word consist does not have the stop words I am looking for. I am pretty sure I made some mistakes in indentation. I would appreciate the help.


Solution

  • You can use OS.listdir to get the number of text files, and use a for loop to run each time. To assign a name to the output file you can use an f-string in its creation so it looks like f'Stop_Word_Consist_{fileName}':

    for entry in OS.listdir(folder location):
        all_reviews , stop_words = clean_text(data_1)
        all_reviews[:]
    
    for r in all_reviews: 
        if not r in stop_words: 
        appendFile = open('Stop_Word_hrb02-phil-usa.txt.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
    
    for r in stop_words: 
        appendFile = open(f'Stop_Word_Consist{entry}.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close()