python - bag of words

I want to create a very simple bag of words based on multiple Excel-files (300).

DummyDoc1 = "This is a testdoc"

DummyDoc2 = "This is also a testdoc, the second one"

...

I can import all the files and I also can do a simple wordcount (dict) for each file. What I don't get is how to combine those two in a matrix that looks something like this.

Code importing files:

def get_files(dir):
    """Return the paths of all entries found directly inside *dir*.

    The paths are built by ``os.scandir`` (directory joined with the entry
    name) and come back in whatever order the operating system yields them.
    """
    paths = []
    for entry in os.scandir(dir):
        paths.append(entry.path)
    return paths

# Collect the input workbooks; the second argument is presumably an
# extension filter -- TODO confirm.
# NOTE(review): get_files_ext is not defined in this snippet (only get_files
# is); later code calls fm.get_files_ext -- verify which one is intended.
files = get_files_ext(DIR_IN, "xlsx")

for file in files:
    # fm.get_filename presumably strips the directory part, since the result
    # is joined with DIR_IN again on the next line -- TODO confirm.
    file = fm.get_filename(file)
    # NOTE(review): df_all is rebound on every pass, so after the loop it
    # holds only the last workbook that was read.
    df_all = pd.read_excel(os.path.join(DIR_IN, file))

Code wordcount:

# Count how often each lower-cased, whitespace-separated token occurs in the file.
d = dict()

# The context manager closes the file even if reading fails part-way through.
with open(r"..\PycharmProjects\DrillPinsBagOfWords\files_in\test.csv", "r", errors="ignore") as text:
    for line in text:
        # split() without an argument collapses runs of whitespace and never
        # yields "", so blank lines and double spaces do not add an empty
        # "word" to the counts (split(" ") did).
        for word in line.lower().split():
            d[word] = d.get(word, 0) + 1


# (word, count) pairs, most frequent first; the sort is stable, so words with
# equal counts stay in first-seen order.
gesorteerd = sorted(d.items(), key=lambda x: x[1], reverse=True)

for x in gesorteerd:
    print(x)

Can someone give me some direction please?

================================================================

Here is the code I have so far. I'm still struggling with the total dict.

import filemanager as fm
import pandas as pd

# Build one {word: count} dict per CSV file and turn the list of dicts into a
# document-term matrix (one row per file, one column per word).
directory = r"C:\Users\files_in_test"

total_dict = dict()  # not used yet anywhere in this snippet
files = fm.get_files_ext(directory, "csv")

count = 0  # running number of the file being processed, used in the debug print
list_dict = []
for filename in files:
    d = dict()
    with open(filename, "r", errors="ignore") as text:
        count += 1
        for line in text:
            # split() without an argument drops empty strings, so blank lines
            # and repeated spaces do not create a "" column in the matrix.
            for word in line.lower().split():
                d[word] = d.get(word, 0) + 1
    print("Print dict", count, d)
    # Build the list of dicts; d is rebound to a fresh dict on every pass,
    # so no copy is needed.
    list_dict.append(d)

# Print the list of dicts.
print("Print list_dict: ", list_dict)

# A word that is missing from a document comes out of the constructor as NaN;
# a bag of words wants an integer count of 0 there instead.
df = pd.DataFrame(list_dict).fillna(0).astype(int)
print(df)

# Words as rows, documents as columns.
result = df.transpose()
print(result)

Solution

Get all those excel files into one directory
Iterate over all files in that directory
Use the code from your wordcount to count words in every file

Use this source to export into excel format

import os

# Count words per file for every regular file in `directory`, then print the
# counts summed over all files, most frequent word first.
total = dict()  # maps each file name to that file's {word: count} dict
directory = "YOUR DIRECTORY HERE"
for filename in os.listdir(directory):
    # os.listdir() returns bare names, so the directory has to be joined back
    # on; otherwise open() only works when the script runs from inside it.
    path = os.path.join(directory, filename)
    if not os.path.isfile(path):
        # Sub-directories and other non-file entries cannot be read as text.
        continue
    d = dict()
    with open(path, "r") as text:
        for line in text:
            # split() without an argument never yields "", so blank lines and
            # repeated spaces are not counted as words.
            for word in line.lower().split():
                d[word] = d.get(word, 0) + 1
    total[filename] = d

# Sum the per-file counts. Sorting `d` directly would rank only the last file
# read and would raise NameError for an empty directory.
overall = dict()
for counts in total.values():
    for word, n in counts.items():
        overall[word] = overall.get(word, 0) + n

gesorteerd = sorted(overall.items(), key=lambda x: x[1], reverse=True)

for x in gesorteerd:
    print(x)