I'm having some trouble with my Python TDM script. Right now it accepts a generated CSV from my other application and then creates a Term Document Matrix out of it. The current issue is that some words in the dictionary that have a frequency of 0 are still appearing in the header.
In the current output, words like one, simply, focus, money, etc. (all the way to the right) should not be added or displayed at all in the created TDM file.
# Imports used by the function
import csv
import os
import re
from collections import Counter
from pathlib import Path
from tkinter import filedialog, messagebox as mb

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

def termDocumentMatrix():
    # Get filename of the CSV file
    filenames = filedialog.askopenfilename(
        title="Datafluent | Open CSV files for TDM", filetypes=[("Comma Separated Value", "*.csv")]
    )
    # Check file paths
    absolute_path = os.path.dirname(__file__)
    relative_path = "temp/upload/to_tdm"
    folderdir = os.path.join(absolute_path, relative_path)
    # Set new filename for generated CSV
    new_filename = Path(filenames).stem
    new_filename = new_filename.replace(' ', '_')
    # Upload file to temp folder (copyfile_tdm is a helper defined elsewhere in my application)
    try:
        copyfile_tdm(filenames, folderdir)
    except Exception:
        mb.showerror(title="Error!", message="File can't be Opened! Might be Wrong Format or Damaged!")
    # Read raw data from file
    data = pd.read_csv(filenames, header=None)
    tdmfile = data[0].str.cat(sep=' ')
    # Clean data by removing commas and new lines
    tdmfile = tdmfile.replace(",", "")
    tdmfile = tdmfile.replace("\n", "")
    # Create lemmatization object
    lemmatizer = WordNetLemmatizer()
    # Tokenize text into sentences
    tokenizer = sent_tokenize(tdmfile)
    # Lemmatize words to get their base form and remove stop words
    lemmawords = []
    for sentence in tokenizer:
        # Convert non-alphabetic characters to spaces
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        tokens = word_tokenize(sentence.lower())
        # Remove stop words and lemmatize remaining words
        lemmawords += [lemmatizer.lemmatize(token) for token in tokens
                       if token not in set(stopwords.words('english'))]
    # Create bag of words dictionary and filter out words with low frequency
    MIN_FREQUENCY = 2
    word_counts = Counter(lemmawords)
    dictionary = {word: i for i, word in enumerate(word_counts.keys()) if word_counts[word] >= MIN_FREQUENCY}
    # Build bag of words model
    sentence_vectors = []
    for sentence in tokenizer:
        sentence_words = set(word_counts.keys()).intersection(set(word_tokenize(sentence)))
        vector = [word_counts[word] for word in sentence_words if word in dictionary]
        sentence_vectors.append(vector)
    sentence_vectors = np.asarray(sentence_vectors)
    # Write output to CSV file
    output_path = f"{new_filename}_TDM.csv"
    with open(output_path, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        header = [word for word in dictionary.keys() if word in set(lemmawords)]
        writer.writerow(header)
        for row in sentence_vectors:
            if not all(x == 0 for x in row):
                writer.writerow(row)
    # Open output file
    os.system(f'start {output_path}')
I've tried a couple of fixes, but they made things a lot worse. I tried building the sentence_vectors and the header words together, but no luck. I also tried adjusting the header, but no luck either.
You can use:
# Tokenize text into sentences
tokenizer = sent_tokenize(tdmfile)

# New code from here
MIN_FREQUENCY = 2
stop_words = stopwords.words('english')
bags = []
for sentence in tokenizer:
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = [lemmatizer.lemmatize(token)
              for token in word_tokenize(sentence.lower())
              if token not in stop_words]
    # One bag of words per sentence; only words that actually occur get a key
    bags.append(Counter(tokens))

# Rows are sentences, columns are words; missing words become 0
bags = pd.DataFrame(bags).fillna(0).astype(int)
# Keep only the columns whose total count exceeds MIN_FREQUENCY
bags = bags.loc[:, bags.sum() > MIN_FREQUENCY]

# Export to file
bags.replace(0, '').to_csv(f'{new_filename}_TDM.csv', index=False)
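This works because each sentence contributes a Counter over only the tokens it actually contains, so pd.DataFrame(bags) only creates a column for a word that occurs at least once somewhere; the bags.sum() filter then drops the remaining low-frequency columns. A zero-frequency word can therefore never reach the header. It also removes the need to build the header and the rows separately, which is where your version goes out of sync: your header lists every dictionary word, while each row only has entries for the words in that particular sentence.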
Output:
>>> bags
     document  entrepreneur  businessman  choose  ten  make  given  answer  ...  attack  company  trick  cybercriminals  personal  vulnerable  sensitive  apps
0           1             2            1       1    1     1      1       1  ...       0        0      0               0         0           0          0     0
1           0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
2           0             1            1       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
3           0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
4           0             0            0       0    0     1      0       0  ...       0        0      0               0         0           0          0     0
..        ...           ...          ...     ...  ...   ...    ...     ...  ...     ...      ...    ...             ...       ...         ...        ...   ...
267         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
268         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
269         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           1          0     0
270         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     1
271         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0

[272 rows x 337 columns]
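If you want to see the mechanism in isolation, here is a minimal, self-contained sketch of the same Counter-to-DataFrame-to-column-filter idea with hard-coded sentences (sample_text and the lowered MIN_FREQUENCY are just for illustration; it assumes NLTK's punkt tokenizer data is installed, as your script already requires):

# Toy demonstration of the per-sentence bag-of-words approach
from collections import Counter
import re

import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize

sample_text = "Entrepreneurs make money. Money matters. Focus on the money."
MIN_FREQUENCY = 1  # lowered so the tiny sample keeps a column

bags = []
for sentence in sent_tokenize(sample_text):
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Each Counter only holds words that actually occur in this sentence
    bags.append(Counter(word_tokenize(sentence.lower())))

bags = pd.DataFrame(bags).fillna(0).astype(int)
bags = bags.loc[:, bags.sum() > MIN_FREQUENCY]
print(bags)  # only 'money' survives: it is the only word with total count > 1

Because every column in the DataFrame originates from a word that was counted at least once, no all-zero column can exist even before the frequency filter runs.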