Search code examples
pythoncsvexport-to-csvtf-idfsklearn-pandas

CSV file with label


As suggested here Python Tf idf algorithm I use this code to get the frequency of words over a set of documents.

import pandas as pd
import csv
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import codecs

def tokenize(text):
    """Tokenize *text* with NLTK's word_tokenize and Porter-stem each token.

    Returns a list of stemmed tokens (strings).
    """
    # Create the stemmer once instead of once per token — the original
    # constructed a new PorterStemmer for every word, which is wasteful
    # since the stemmer is reusable.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in word_tokenize(text)]

with codecs.open("book1.txt", 'r', 'utf-8') as i1,\
        codecs.open("book2.txt", 'r', 'utf-8') as i2,\
        codecs.open("book3.txt", 'r', 'utf-8') as i3:
    # your corpus: read each book as one string, newlines flattened to spaces
    t1 = i1.read().replace('\n', ' ')
    t2 = i2.read().replace('\n', ' ')
    t3 = i3.read().replace('\n', ' ')

    text = [t1, t2, t3]
    # word tokenize and stem each document
    text = [" ".join(tokenize(txt.lower())) for txt in text]
    vectorizer = TfidfVectorizer()
    # toarray() (not todense()) so pandas gets a plain ndarray —
    # todense() returns np.matrix, which recent pandas versions reject
    matrix = vectorizer.fit_transform(text).toarray()
    # transform the matrix to a pandas df
    # (get_feature_names() was removed in scikit-learn 1.2; use
    # get_feature_names_out() instead)
    matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names_out())
    # sum the tf-idf scores over documents (axis=0) and rank terms
    top_words = matrix.sum(axis=0).sort_values(ascending=False)

    top_words.to_csv('dict.csv', index=True, float_format="%f", encoding="utf-8")

With the last line, I create a CSV file that lists all words and their frequencies. Is there a way to attach a label to them, to see whether a word belongs only to the third document or to all of them? My goal is to delete from the CSV file all the words that appear only in the 3rd document (book3).


Solution

  • You can use the isin() method to filter the top_words of the third book out of the top_words of the entire corpus.

    (For the example below I downloaded three random books from http://www.gutenberg.org/)

    import codecs
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    # import nltk
    # nltk.download('punkt')
    from nltk import word_tokenize
    from nltk.stem.porter import PorterStemmer
    
    def tokenize(text):
        """Tokenize *text* with NLTK's word_tokenize and Porter-stem each token.

        Returns a list of stemmed tokens (strings).
        """
        # Create the stemmer once instead of once per token — the original
        # constructed a new PorterStemmer for every word, which is wasteful
        # since the stemmer is reusable.
        stemmer = PorterStemmer()
        return [stemmer.stem(token) for token in word_tokenize(text)]
    
    with codecs.open("56732-0.txt", 'r', 'utf-8') as i1,\
            codecs.open("56734-0.txt", 'r', 'utf-8') as i2,\
            codecs.open("56736-0.txt", 'r', 'utf-8') as i3:
        # your corpus: read each book as one string, newlines flattened to spaces
        t1 = i1.read().replace('\n', ' ')
        t2 = i2.read().replace('\n', ' ')
        t3 = i3.read().replace('\n', ' ')

    text = [t1, t2, t3]
    # word tokenize and stem each document
    text = [" ".join(tokenize(txt.lower())) for txt in text]
    vectorizer = TfidfVectorizer()
    # toarray() (not todense()) so pandas gets a plain ndarray —
    # todense() returns np.matrix, which recent pandas versions reject
    matrix = vectorizer.fit_transform(text).toarray()
    # transform the matrix to a pandas df
    # (get_feature_names() was removed in scikit-learn 1.2; use
    # get_feature_names_out() instead)
    matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names_out())
    # sum over each document (axis=0)
    top_words = matrix.sum(axis=0).sort_values(ascending=False)

    # top_words for the 3rd book alone
    # (re-fitting the vectorizer restricts its vocabulary to t3 only)
    text = [" ".join(tokenize(t3.lower()))]
    matrix = vectorizer.fit_transform(text).toarray()
    matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names_out())
    top_words3 = matrix.sum(axis=0).sort_values(ascending=False)

    # Mask out words in t3.
    # NOTE(review): this removes every word that occurs in book 3 at all,
    # including words shared with books 1 and 2 — not only words exclusive
    # to book 3. Confirm this matches the intended filtering before relying
    # on the output.
    mask = ~top_words.index.isin(top_words3.index)
    # Filter those words from top_words
    top_words = top_words[mask]

    top_words.to_csv('dict.csv', index=True, float_format="%f", encoding="utf-8")