Tags: python, nlp

Filtering stop words out of multiple text files (using a list of stop words)


I have a folder named cleaned_texts. The folder contains text files (a.txt, b.txt, c.txt, etc.), and each text file contains tokenized words in this format: ['Rise', 'of', 'e-health', 'and', 'its', 'Germany', 'dollar'].

Example:

a.txt contains ['Rise', 'of', 'e-health', 'and', 'its', 'Thailand', 'YEN', 'India'] and

b.txt contains ['PESO', 'Man', 'development', 'never', 'Japan', 'year', 'date', 'Canada'].

I also have another folder named StopWords, which also contains text files; each text file contains stop words. The text files are named in this format (currency.txt, names.txt, geographic.txt, etc.).

Example:

currency.txt contains names of currencies (e.g., BAHT | Thailand, PESO | Mexico, YEN | Japan, etc.).

geographic.txt contains names of countries (e.g., Canada, China, India, Germany, etc.).

I want to filter all the stop words contained in the text files inside the StopWords folder out of all the text files in the cleaned_texts folder.

I looped through the StopWords folder, combined all the stop words, and converted them to a list. My challenge is how to filter the stop words out of my cleaned_texts files. I have been on it for days now but I couldn't figure out how to do it.

Here is my script:

import glob
import codecs
import os

#Cleaned texts
os.getcwd()
clean_texts_folder =  os.path.join(os.getcwd(), 'cleaned_texts')

clean_text_data = []
for root, folders, files in os.walk(clean_texts_folder):
    for file in files:
        path = os.path.join(root, file)
        with codecs.open(path, encoding='utf-8', errors='ignore') as info:
            clean_text_data.append(info.read())


#Stop Words
stopwords_folder_path = "StopWords"
stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt'))

for file in stopwords_files:
    with open(file, 'r') as w:
        stop_words = w.read()
        
        map_dict = {'|': ''}
        res = ''.join(
            idx if idx not in map_dict else map_dict[idx] for idx in stop_words)
        new_list = res.split()

#new_list Output= ['SMITH', 'Surnames', 'from', '1990', 'Thailand', 'YEN', 'India', 'PESO', 'Japan', 'Canada']


#Trying to save the filtered texts
folder_name = "new_texts"
Path(folder).mkdir(parents=True, exist_ok=True)
filtered_sentence = []
for index, word in enumerate(clean_text_data):
    if word not in new_list:
        #print(filtered_sentence.append(word))
        file_path = Path(folder_name, f"{index}.txt")
        with pathlib.Path.open(file_path, "w", encoding="utf-8") as f:
           f.write(f"{filtered_sentence }")

Actual/Resulting Output: "None" is printed in all the text files.

a.txt = None

b.txt = None

c.txt = None

Expected Output:

a.txt = ['Rise', 'of', 'e-health', 'and', 'its']

b.txt = ['Man', 'development', 'never', 'year', 'date']


Solution

  • You are not correctly combining the stop words from the different files: each pass through the loop overwrites new_list instead of extending it. You also never append anything to filtered_sentence, so you end up writing an empty list to your files. On top of that, Path(folder) references the undefined name folder (the variable is folder_name), and pathlib is never imported. Try the following instead:

    import ast
    import codecs
    import glob
    import os
    from pathlib import Path
    
    # Cleaned texts
    clean_texts_folder = os.path.join(os.getcwd(), 'cleaned_texts')
    
    # Stop Words
    stopwords_folder_path = "StopWords"
    stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt'))
    
    # Combine all stop words into a single list. Replacing '|' with a space
    # before splitting turns an entry like "YEN | Japan" into two separate
    # stop words, matching the new_list shown in the question.
    stop_words = []
    for file in stopwords_files:
        with open(file, 'r', encoding='utf-8') as w:
            stop_words.extend(w.read().replace('|', ' ').split())
    
    # Remove duplicates and convert to set for faster lookup
    stop_words = set(stop_words)
    
    # Loop through cleaned texts and filter out stop words
    folder_name = "new_texts"
    Path(folder_name).mkdir(parents=True, exist_ok=True)
    
    for root, folders, files in os.walk(clean_texts_folder):
        for file in files:
            path = os.path.join(root, file)
            with codecs.open(path, encoding='utf-8', errors='ignore') as info:
                content = ast.literal_eval(info.read())  # Safely parse the list literal into a list
                filtered_content = [word for word in content if word not in stop_words]
                file_path = os.path.join(folder_name, os.path.basename(path))
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(str(filtered_content))
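
  • A note on the design choice: ast.literal_eval is preferable to eval for turning the file contents back into a Python list, because it only accepts Python literals, so a malformed or malicious text file cannot execute arbitrary code. As a quick sanity check, here is a minimal sketch, using the example data from the question, that verifies the parsing and filtering logic in isolation:

    import ast

    # Simulated contents of a.txt and the combined stop-word set
    raw = "['Rise', 'of', 'e-health', 'and', 'its', 'Thailand', 'YEN', 'India']"
    stop_words = {'Thailand', 'YEN', 'India', 'PESO', 'Japan', 'Canada'}

    tokens = ast.literal_eval(raw)  # parses the literal into a list of strings
    filtered = [t for t in tokens if t not in stop_words]
    print(filtered)  # ['Rise', 'of', 'e-health', 'and', 'its']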