Tags: python, nlp

Filtering stop words out of multiple text files (using a list of stop words)


I have a folder named cleaned_texts. The folder contains text files (a.txt, b.txt, c.txt, etc.), and each text file contains tokenized words in this format: ['Rise', 'of', 'e-health', 'and', 'its', 'Germany', 'dollar'].

Example:

a.txt contains ['Rise', 'of', 'e-health', 'and', 'its', 'Thailand', 'YEN', 'India'] and

b.txt contains ['PESO', 'Man', 'development', 'never', 'Japan', 'year', 'date', 'Canada'].

I also have another folder named StopWords, which also contains text files; each text file contains stop words. The text files are named in this format (currency.txt, names.txt, geographic.txt, etc.).

Example:

currency.txt contains names of currencies (e.g., BAHT | Thailand, PESO | Mexico, YEN | Japan, etc.).

geographic.txt contains names of countries (e.g., Canada, China, India, Germany, etc.).

I want to filter all the stop words contained in the text files inside the StopWords folder out of all the text files in the cleaned_texts folder.

I looped through the StopWords folder, combined all the stop words, and converted them to a list. My challenge is how to filter the stop words out of my cleaned_texts files. I have been on it for days now but I couldn't figure out how to do it.

Here is my script:

import glob
import codecs
import os

#Cleaned texts
os.getcwd()
clean_texts_folder =  os.path.join(os.getcwd(), 'cleaned_texts')

clean_text_data = []
for root, folders, files in os.walk(clean_texts_folder):
    for file in files:
        path = os.path.join(root, file)
        with codecs.open(path, encoding='utf-8', errors='ignore') as info:
            clean_text_data.append(info.read())


#Stop Words
stopwords_folder_path = "StopWords"
stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt'))

for file in stopwords_files:
    with open(file, 'r') as w:
        stop_words = w.read()
        
        map_dict = {'|': ''}
        res = ''.join(
            idx if idx not in map_dict else map_dict[idx] for idx in stop_words)
        new_list = res.split()

#new_list Output= ['SMITH', 'Surnames', 'from', '1990', 'Thailand', 'YEN', 'India', 'PESO', 'Japan', 'Canada']


#Trying to save the filtered texts
folder_name = "new_texts"
Path(folder).mkdir(parents=True, exist_ok=True)
filtered_sentence = []
for index, word in enumerate(clean_text_data):
    if word not in new_list:
        #print(filtered_sentence.append(word))
        file_path = Path(folder_name, f"{index}.txt")
        with pathlib.Path.open(file_path, "w", encoding="utf-8") as f:
           f.write(f"{filtered_sentence }")

Actual/Resulting Output: "None" is printed in all the text files.

a.txt = None

b.txt = None

c.txt = None

Expected Output:

a.txt = ['Rise', 'of', 'e-health', 'and', 'its']

b.txt = ['Man', 'development', 'never', 'year', 'date']


Solution

  • You are not correctly combining the stop words from the different files: each pass through the loop overwrites new_list instead of extending it. You also never append anything to filtered_sentence, so you end up writing an empty list to your files. On top of that, Path(folder) references the undefined name folder (the variable is folder_name), and pathlib is never imported. Try the following instead:

    import ast
    import codecs
    import glob
    import os
    from pathlib import Path
    
    # Cleaned texts
    clean_texts_folder = os.path.join(os.getcwd(), 'cleaned_texts')
    
    # Stop Words
    stopwords_folder_path = "StopWords"
    stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt'))
    
    # Combine all stop words into a single list. Replacing '|' with a space
    # before splitting turns an entry like "YEN | Japan" into two separate
    # stop words, matching the new_list shown in the question.
    stop_words = []
    for file in stopwords_files:
        with open(file, 'r', encoding='utf-8') as w:
            stop_words.extend(w.read().replace('|', ' ').split())
    
    # Remove duplicates and convert to set for faster lookup
    stop_words = set(stop_words)
    
    # Loop through cleaned texts and filter out stop words
    folder_name = "new_texts"
    Path(folder_name).mkdir(parents=True, exist_ok=True)
    
    for root, folders, files in os.walk(clean_texts_folder):
        for file in files:
            path = os.path.join(root, file)
            with codecs.open(path, encoding='utf-8', errors='ignore') as info:
                content = ast.literal_eval(info.read())  # Safely parse the list literal into a list
                filtered_content = [word for word in content if word not in stop_words]
                file_path = os.path.join(folder_name, os.path.basename(path))
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(str(filtered_content))
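
  • A note on the design choice: ast.literal_eval is preferable to eval for turning the file contents back into a Python list, because it only accepts Python literals, so a malformed or malicious text file cannot execute arbitrary code. As a quick sanity check, here is a minimal sketch, using the example data from the question, that verifies the parsing and filtering logic in isolation:

    import ast

    # Simulated contents of a.txt and the combined stop-word set
    raw = "['Rise', 'of', 'e-health', 'and', 'its', 'Thailand', 'YEN', 'India']"
    stop_words = {'Thailand', 'YEN', 'India', 'PESO', 'Japan', 'Canada'}

    tokens = ast.literal_eval(raw)  # parses the literal into a list of strings
    filtered = [t for t in tokens if t not in stop_words]
    print(filtered)  # ['Rise', 'of', 'e-health', 'and', 'its']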