python csv export-to-csv tf-idf sklearn-pandas

CSV file with label

As suggested here Python Tf idf algorithm I use this code to get the frequency of words over a set of documents.

import pandas as pd
import csv
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import codecs

def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens: stems.append(PorterStemmer().stem(item))
    return stems

with codecs.open("book1.txt",'r','utf-8') as i1,\
        codecs.open("book2.txt",'r','utf-8') as i2,\
        codecs.open("book3.txt",'r','utf-8') as i3:
    # your corpus
    t1=i1.read().replace('\n',' ')
    t2=i2.read().replace('\n',' ')
    t3=i3.read().replace('\n',' ')

    text = [t1,t2,t3]
    # word tokenize and stem
    text = [" ".join(tokenize(txt.lower())) for txt in text]
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(text).todense()
    # transform the matrix to a pandas df
    matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
    # sum over each document (axis=0)
    top_words = matrix.sum(axis=0).sort_values(ascending=False)

    top_words.to_csv('dict.csv', index=True, float_format="%f",encoding="utf-8")

With the last line, I create a csv file where are listed all words and their frequency. Is there a way to put a label to them, to see if a word belong only to the third document, or to all? My goal is to delete from the csv file all the words that appear only in the 3rd document (book3)

Solution

You can use the isin() attribute to filter out your top_words in the third book from the top_ words in the entire corpus.

(For the example below I downloaded three random books from http://www.gutenberg.org/)

import codecs
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# import nltk
# nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer

def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens: stems.append(PorterStemmer().stem(item))
    return stems

with codecs.open("56732-0.txt",'r','utf-8') as i1,\
        codecs.open("56734-0.txt",'r','utf-8') as i2,\
        codecs.open("56736-0.txt",'r','utf-8') as i3:
    # your corpus
    t1=i1.read().replace('\n',' ')
    t2=i2.read().replace('\n',' ')
    t3=i3.read().replace('\n',' ')

text = [t1,t2,t3]
# word tokenize and stem
text = [" ".join(tokenize(txt.lower())) for txt in text]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text).todense()
# transform the matrix to a pandas df
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
# sum over each document (axis=0)
top_words = matrix.sum(axis=0).sort_values(ascending=False)

# top_words for the 3rd book alone
text = [" ".join(tokenize(t3.lower()))]
matrix = vectorizer.fit_transform(text).todense()
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
top_words3 = matrix.sum(axis=0).sort_values(ascending=False)

# Mask out words in t3
mask = ~top_words.index.isin(top_words3.index)
# Filter those words from top_words
top_words = top_words[mask]

top_words.to_csv('dict.csv', index=True, float_format="%f",encoding="utf-8")