I have a folder named cleaned_texts. The folder contains text files(a.txt, b.txt, c.txt etc) and each text file contains tokenized words in this format:['Rise', 'of', 'e-health', 'and', 'its', 'Germany', 'dollar'].
Example:
a.txt contains ['Rise', 'of', 'e-health', 'and', 'its', 'Thailand', 'YEN', 'India'] and
b.txt contains ['PESO', 'Man', 'development', 'never', 'Japan', 'year', 'date', 'Canada'].
I also have another folder named StopWords which also contains text files and each text file contains a stop word. The text files are named in this format (currency.txt, names.txt, geographic.txt etc).
Example:
currency.txt contains names of currencies (Eg: BAHT | Thailand, PESO | Mexico, YEN | Japan etc).
geographic.txt contains names of countries (Eg: Canada, China, India, Germany etc).
I want to filter all the stop words contained in the text files inside the StopWords folder, from all the text files in the cleaned_texts folder.
I looped through the stop words folder, Combined all the stop words and converted it to a list. My challenge is how to filter the stop words from my cleaned_texts files. I have been on it for days now but i couldn't figure out how to do it.
Here is my script:
import glob
import codecs
import os
#Cleaned texts
os.getcwd()
clean_texts_folder = os.path.join(os.getcwd(), 'cleaned_texts')
clean_text_data = []
for root, folders, files in os.walk(clean_texts_folder):
for file in files:
path = os.path.join(root, file)
with codecs.open(path, encoding='utf-8', errors='ignore') as info:
clean_text_data.append(info.read())
#Stop Words
stopwords_folder_path = "StopWords"
stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt'))
for file in stopwords_files:
with open(file, 'r') as w:
stop_words = w.read()
map_dict = {'|': ''}
res = ''.join(
idx if idx not in map_dict else map_dict[idx] for idx in stop_words)
new_list = res.split()
#new_list Output= ['SMITH', 'Surnames', 'from', '1990', 'Thailand', 'YEN', 'India', 'PESO', 'Japan', 'Canada']
#Trying to save the filtered texts
folder_name = "new_texts"
Path(folder).mkdir(parents=True, exist_ok=True)
filtered_sentence = []
for index, word in enumerate(clean_text_data):
if word not in new_list:
#print(filtered_sentence.append(word))
file_path = Path(folder_name, f"{index}.txt")
with pathlib.Path.open(file_path, "w", encoding="utf-8") as f:
f.write(f"{filtered_sentence }")
Actual/Resulting Output: "None" is printing in all the text files.
a.txt = None
b.txt = None
c.txt = None
Expected Output:
a.txt = ['Rise', 'of', 'e-health', 'and', 'its']
b.txt = ['Man', 'development', 'never','year', 'date']
You are not correctly combining stopwords from different files. Also, you never assigned any value to filtered_sentence
so you end up writing an empty list to your files, resulting in the unexpected output. Try the following instead:
import glob
import codecs
import os
from pathlib import Path
# Cleaned texts
os.getcwd()
clean_texts_folder = os.path.join(os.getcwd(), 'cleaned_texts')
# Stop Words
stopwords_folder_path = "StopWords"
stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt'))
# Combine all stop words into a single list
stop_words = []
for file in stopwords_files:
with open(file, 'r', encoding='utf-8') as w:
stop_words_in_file = [word.strip() for word in w.read().split('|')]
stop_words.extend(stop_words_in_file)
# Remove duplicates and convert to set for faster lookup
stop_words = set(stop_words)
# Loop through cleaned texts and filter out stop words
folder_name = "new_texts"
Path(folder_name).mkdir(parents=True, exist_ok=True)
for root, folders, files in os.walk(clean_texts_folder):
for file in files:
path = os.path.join(root, file)
with codecs.open(path, encoding='utf-8', errors='ignore') as info:
content = eval(info.read()) # Convert string to list
filtered_content = [word for word in content if word not in stop_words]
file_path = os.path.join(folder_name, os.path.basename(path))
with open(file_path, "w", encoding="utf-8") as f:
f.write(str(filtered_content))