I want to create a very simple bag of words based on multiple Excel-files (300).
DummyDoc1 = "This is a testdoc"
DummyDoc2 = "This is also a testdoc, the second one"
...
I can import all the files and I can also do a simple word count (dict) for each file. What I don't get is how to combine those two into a matrix that looks something like this.
Code importing files:
def get_files(dir):
    """Return the paths of all entries directly inside a directory.

    Args:
        dir: Path of the directory to scan. The name shadows the builtin
            ``dir``; it is kept so existing keyword callers keep working.

    Returns:
        A list holding the ``path`` attribute of every entry yielded by
        ``os.scandir(dir)``. No filtering is applied, so sub-directories
        are included alongside regular files.
    """
    # The context manager closes the directory handle even if iteration is
    # interrupted by an exception.
    with os.scandir(dir) as entries:
        return [entry.path for entry in entries]
# Read every .xlsx workbook found in DIR_IN and combine them into one frame.
# NOTE(review): get_files_ext is called unqualified here but as
# fm.get_files_ext further down in this document - confirm which name is
# actually in scope.
files = get_files_ext(DIR_IN, "xlsx")
frames = []
for file in files:
    # presumably reduces the entry to its bare file name so it can be joined
    # onto DIR_IN again - verify against the filemanager module.
    file = fm.get_filename(file)
    # Keep each workbook; rebinding a single name per iteration would leave
    # only the last file's data available after the loop.
    frames.append(pd.read_excel(os.path.join(DIR_IN, file)))
# pd.concat raises ValueError for an empty list, so fall back to an empty frame.
df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
Code wordcount:
from collections import Counter

# Count how often each lower-cased, whitespace-separated token occurs in the
# file and print the (word, count) pairs from most to least frequent.
d = Counter()
# The with-block guarantees the file is closed once counting is done.
with open(r"..\PycharmProjects\DrillPinsBagOfWords\files_in\test.csv", "r", errors="ignore") as text:
    for line in text:
        # split() without an argument collapses runs of whitespace and drops
        # leading/trailing whitespace, so blank lines and double spaces no
        # longer produce an empty-string "word".
        d.update(line.lower().split())

# most_common() orders by count, highest first; equal counts keep the order
# in which the words were first seen.
gesorteerd = d.most_common()
for x in gesorteerd:
    print(x)
Can someone give me some direction please?
================================================================
Here is the code I have so far. I'm still struggling with the total dict.
import filemanager as fm
import pandas as pd
from collections import Counter

# Build one word-count dict per CSV file, then turn the list of dicts into a
# document/word matrix and print it in both orientations.
directory = r"C:\Users\files_in_test"
total_dict = dict()  # never filled in this snippet; kept as in the original
files = fm.get_files_ext(directory, "csv")
count = 0  # stays 0 when no files are found
list_dict = []
for count, filename in enumerate(files, start=1):
    word_counts = Counter()
    with open(filename, "r", errors="ignore") as text:
        for line in text:
            # split() without an argument skips empty tokens caused by blank
            # lines or repeated spaces.
            word_counts.update(line.lower().split())
    # Store a plain dict so the printed output keeps the plain-dict format.
    d = dict(word_counts)
    print("Print dict", count, d)
    # Build the list of dicts (one per file).
    list_dict.append(d)

# Print the list of dicts.
print("Print list_dict: ", list_dict)
# A word that does not occur in a document is missing from that document's
# dict and would show up as NaN; in a bag-of-words matrix it is a count of 0.
df = pd.DataFrame(list_dict).fillna(0).astype(int)
print(df)
# Transposed view: one row per word, one column per document.
result = df.transpose()
print(result)
Word count
This counts the words in every file. Use this source to export into Excel format.
import os
from collections import Counter

# Count the words of every file in `directory`: `total` maps each file name
# to that file's {word: count} dict, and the counts of each file are printed
# from most to least frequent.
total = dict()
directory = "YOUR DIRECTORY HERE"
for filename in os.listdir(directory):
    # os.listdir yields bare names; join them with the directory so the file
    # is found regardless of the current working directory.
    path = os.path.join(directory, filename)
    if not os.path.isfile(path):
        # Sub-directories cannot be opened as text files.
        continue
    word_counts = Counter()
    with open(path, "r") as text:
        for line in text:
            # split() without an argument skips empty tokens caused by blank
            # lines or repeated spaces.
            word_counts.update(line.lower().split())
    d = dict(word_counts)
    total[filename] = d
    # Report inside the loop so every file is covered and an empty directory
    # does not hit an undefined name after the loop.
    gesorteerd = word_counts.most_common()
    for x in gesorteerd:
        print(x)