Tags: python, pandas, numpy, nltk

Header issue in generated TDM via Python


I'm having some trouble with my Python TDM. Right now it accepts a generated CSV from my other application and then creates a Term Document Matrix out of it. The current issue is that some words in the dictionary that have 0 frequency are still appearing in the header.

In the current output, words like one, simply, focus, money, etc., all the way at the right of the header, have 0 frequency but are still being added to the created TDM file; they should not be displayed at all.

import csv
import os
import re
from collections import Counter
from pathlib import Path
from tkinter import filedialog
from tkinter import messagebox as mb

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# NLTK data needed once beforehand:
# nltk.download('punkt'); nltk.download('wordnet'); nltk.download('stopwords')


def termDocumentMatrix():
    # Get filenames of CSV files
    filenames = filedialog.askopenfilename(
        title="Datafluent | Open CSV files for TDM", filetypes=[("Comma Separated Value", "*.csv")]
    )

    # Check file paths
    absolute_path = os.path.dirname(__file__)
    relative_path = "temp/upload/to_tdm"
    folderdir = os.path.join(absolute_path, relative_path)

    # Set new filename for generated CSV
    new_filename = Path(filenames).stem
    new_filename = new_filename.replace(' ', '_')

    # Upload file to temp folder (copyfile_tdm is a helper defined elsewhere in this application)
    try:
        copyfile_tdm(filenames, folderdir)
    except Exception:
        mb.showerror(title="Error!", message="File can't be opened! It might be the wrong format or damaged.")

    # Read raw data from file
    data = pd.read_csv(filenames, header=None)
    tdmfile = data[0].str.cat(sep=' ')

    # Clean data by removing commas and new lines
    tdmfile = tdmfile.replace(",", "")
    tdmfile = tdmfile.replace("\n", "")

    # Create Lemmatization Object
    lemmatizer = WordNetLemmatizer()

    # Tokenize text into sentences
    tokenizer = sent_tokenize(tdmfile)

    # Lemmatize words to get their proper meaning and remove stop words
    lemmawords = []
    for sentence in tokenizer:
        # Convert non-alphabetic characters to spaces
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        tokens = word_tokenize(sentence.lower())
        # Remove stop words and lemmatize remaining words
        lemmawords += [lemmatizer.lemmatize(token) for token in tokens if token not in set(stopwords.words('english'))]

    # Create bag of words dictionary and filter out words with low frequency
    MIN_FREQUENCY = 2
    word_counts = Counter(lemmawords)
    dictionary = {word: i for i, word in enumerate(word_counts.keys()) if word_counts[word] >= MIN_FREQUENCY}

    # Build bag of words model
    sentence_vectors = []
    for sentence in tokenizer:
        sentence_words = set(word_counts.keys()).intersection(set(word_tokenize(sentence)))
        vector = [word_counts[word] for word in sentence_words if word in dictionary]
        sentence_vectors.append(vector)

    sentence_vectors = np.asarray(sentence_vectors)

    # Write output to CSV file
    output_path = f"{new_filename}_TDM.csv"
    with open(output_path, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        header = [word for word in dictionary.keys() if word in set(lemmawords)]
        writer.writerow(header)
        for row in sentence_vectors:
            if not all(x == 0 for x in row):
                writer.writerow(row)

    # Open output file (Windows-specific; 'start' is a cmd.exe built-in)
    os.system(f'start {output_path}')

I've tried a couple of fixes, but they made things a lot worse. I tried building the sentence_vectors and the header_Words together, with no luck, and I tried adjusting the header as well, also with no luck.
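As far as I can tell, the header and the rows drift apart: header keeps every word in dictionary (the in set(lemmawords) filter is a no-op, since the dictionary was built from those very words), while each row only gets values for words found in that particular sentence, unpadded and in set order. Shorter rows shift left under the full-width header, so the rightmost words look like empty zero-frequency columns. A minimal sketch with made-up tokens showing the mismatch:

from collections import Counter

lemmawords = ['money', 'money', 'focus', 'one', 'one']
word_counts = Counter(lemmawords)  # {'money': 2, 'focus': 1, 'one': 2}
dictionary = {w: i for i, w in enumerate(word_counts) if word_counts[w] >= 2}

header = [w for w in dictionary if w in set(lemmawords)]  # filter is a no-op
row = [word_counts[w] for w in {'money'} if w in dictionary]  # only words in this "sentence"
print(header)  # ['money', 'one']
print(row)     # [2]  <- one value under a two-column header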


Solution

  • You can replace everything from the sentence tokenization onward. Building one Counter per sentence and stacking the Counters into a DataFrame keeps the header and the counts in a single structure, so filtering columns by total frequency drops the low-frequency words and their header entries together:

    # Tokenize text into sentences
    tokenizer = sent_tokenize(tdmfile)
    
    # New code from here
    MIN_FREQUENCY = 2
    stop_words = set(stopwords.words('english'))
    
    bags = []
    for sentence in tokenizer:
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        tokens = [lemmatizer.lemmatize(token)
                  for token in word_tokenize(sentence.lower())
                  if token not in stop_words]
        bags.append(Counter(tokens))
    
    bags = pd.DataFrame(bags).fillna(0).astype(int)
    bags = bags.loc[:, bags.sum() > MIN_FREQUENCY]
    
    # Export to file
    bags.replace(0, '').to_csv(f'{new_filename}_TDM.csv', index=False)
    

    Output:

    >>> bags
         document  entrepreneur  businessman  choose  ten  make  given  answer  ...  attack  company  trick  cybercriminals  personal  vulnerable  sensitive  apps
    0           1             2            1       1    1     1      1       1  ...       0        0      0               0         0           0          0     0
    1           0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
    2           0             1            1       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
    3           0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
    4           0             0            0       0    0     1      0       0  ...       0        0      0               0         0           0          0     0
    ..        ...           ...          ...     ...  ...   ...    ...     ...  ...     ...      ...    ...             ...       ...         ...        ...   ...
    267         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
    268         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
    269         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          1     0
    270         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     1
    271         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
    
    [272 rows x 337 columns]
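
    Why this fixes the header: pandas builds the columns from the union of all
    the per-sentence Counter keys and aligns every row by column name, filling
    gaps with NaN, so the header and the counts can never disagree, and
    dropping a column removes its header cell at the same time. A tiny
    standalone illustration (made-up sentences, not from the question's data):

    from collections import Counter

    import pandas as pd

    # Two hypothetical "sentences" as bags of words
    bags = [Counter(['money', 'money', 'focus']), Counter(['one', 'money'])]

    # Columns are the union of keys; missing entries become NaN -> 0
    df = pd.DataFrame(bags).fillna(0).astype(int)
    print(df)
    #    money  focus  one
    # 0      2      1    0
    # 1      1      0    1

    # Filtering columns by total frequency drops data and header together
    print(df.loc[:, df.sum() > 1])
    #    money
    # 0      2
    # 1      1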