I am trying to understand the math behind TfidfVectorizer. I used this tutorial, but my code is slightly changed.
The tutorial also says at the end: "The values differ slightly because sklearn uses a smoothed version of idf and various other little optimizations."
I want to be able to use TfidfVectorizer but also calculate the same simple sample by hand.
Here is my whole code:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
def main():
    """Print hand-computed TF, IDF and TF-IDF tables next to scikit-learn's.

    Works on a fixed two-document corpus. The hand-computed tables come from
    computeTF, computeIDF and computeTFIDF; the scikit-learn tables come from
    CountVectorizer + TfidfTransformer and from TfidfVectorizer, so the two
    approaches can be compared section by section in the printed output.
    """
    documentA = 'the man went out for a walk'
    documentB = 'the children sat around the fire'
    corpus = [documentA, documentB]

    # Accepts one-character tokens (e.g. 'a') so that the vectorizers see the
    # same vocabulary as the plain split(' ') tokenisation used below.
    tokenPattern = '(?u)\\b\\w\\w*\\b'

    def featureNames(vectorizer):
        # get_feature_names() was removed in scikit-learn 1.2 in favour of
        # get_feature_names_out(); fall back only on releases predating it.
        if hasattr(vectorizer, 'get_feature_names_out'):
            return list(vectorizer.get_feature_names_out())
        return vectorizer.get_feature_names()

    bagOfWordsA = documentA.split(' ')
    bagOfWordsB = documentB.split(' ')
    uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

    print('----------- compare word count -------------------')
    # Per-document raw counts over the shared vocabulary (absent words stay 0).
    numOfWordsA = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsA:
        numOfWordsA[word] += 1
    numOfWordsB = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsB:
        numOfWordsB[word] += 1

    tfA = computeTF(numOfWordsA, bagOfWordsA)
    tfB = computeTF(numOfWordsB, bagOfWordsB)
    print(pd.DataFrame([tfA, tfB]))

    # use_idf=False with l1 normalisation yields count / document length,
    # i.e. the same quantity computeTF produces.
    CV = CountVectorizer(stop_words=None, token_pattern=tokenPattern)
    cv_ft = CV.fit_transform(corpus)
    tt = TfidfTransformer(use_idf=False, norm='l1')
    t = tt.fit_transform(cv_ft)
    print(pd.DataFrame(t.todense().tolist(), columns=featureNames(CV)))

    print('----------- compare idf -------------------')
    idfs = computeIDF([numOfWordsA, numOfWordsB])
    print(pd.DataFrame([idfs]))

    tfidfA = computeTFIDF(tfA, idfs)
    tfidfB = computeTFIDF(tfB, idfs)
    print(pd.DataFrame([tfidfA, tfidfB]))

    ttf = TfidfTransformer(use_idf=True, smooth_idf=False, norm=None)
    f = ttf.fit_transform(cv_ft)
    print(pd.DataFrame(f.todense().tolist(), columns=featureNames(CV)))

    print('----------- TfidfVectorizer -------------------')
    vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True,
                                 stop_words=None, token_pattern=tokenPattern,
                                 norm=None)
    vectors = vectorizer.fit_transform(corpus)
    feature_names = featureNames(vectorizer)
    print(pd.DataFrame(vectors.todense().tolist(), columns=feature_names))
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in one document.

    Args:
        wordDict: mapping of word -> raw count of that word in the document.
        bagOfWords: the document's tokens; only its length is used.

    Returns:
        A dict mapping each key of ``wordDict`` to ``count / len(bagOfWords)``
        as a float. For an empty ``bagOfWords`` every frequency is 0.0.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty document has no terms; report zeros instead of dividing
        # by zero.
        return {word: 0.0 for word in wordDict}
    total = float(bagOfWordsCount)
    return {word: count / total for word, count in wordDict.items()}
def computeIDF(documents):
    """Return the unsmoothed inverse document frequency of every word.

    Uses the textbook formula ``log(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents in which the word has a
    count greater than zero. No smoothing and no ``+ 1`` offset are applied.

    Args:
        documents: list of dicts, each mapping word -> count for one document.

    Returns:
        A dict mapping every word seen in any document to its IDF. Returns an
        empty dict when ``documents`` is empty. A word whose count is zero in
        every document gets 0.0.
    """
    import math  # kept local: the visible file header does not import math

    N = len(documents)
    if N == 0:
        return {}

    # Count, per word, the documents that actually contain it. The vocabulary
    # is the union over all documents, so a word missing from the first
    # document no longer raises KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    idfDict = {}
    for word, df in docFreq.items():
        # A word that occurs nowhere has TF 0 everywhere, so any finite value
        # works; 0.0 avoids the division by zero and keeps TF * IDF finite.
        idfDict[word] = math.log(N / float(df)) if df else 0.0
    return idfDict
def computeTFIDF(tfBagOfWords, idfs):
    """Multiply each term frequency by the matching inverse document frequency.

    Args:
        tfBagOfWords: mapping of word -> term frequency for one document.
        idfs: mapping of word -> IDF; must contain every key of tfBagOfWords.

    Returns:
        A dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {
        term: frequency * idfs[term]
        for term, frequency in tfBagOfWords.items()
    }
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
I can compare the calculation of term frequency, and both results look the same. But when I calculate the IDF and then TF-IDF, there are differences between the code from the website and TfidfVectorizer (I also tried the combination of CountVectorizer and TfidfTransformer to be sure it returns the same results as TfidfVectorizer does).
Code Tf-Idf results:
TfidfVectorizer Tf-Idf results:
Can anybody help me with code that would return the same results as TfidfVectorizer, or with a TfidfVectorizer setting that would return the same results as the code above?
Here is my adaptation of your code to reproduce the TfidfVectorizer output for your data.
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from IPython.display import display
# Reproduce TfidfVectorizer(norm=None) by hand for a two-document corpus and
# show the hand-computed table next to scikit-learn's.
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'
corpus = [documentA, documentB]

bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

print('----------- compare word count -------------------')
# Per-document raw counts over the shared vocabulary (absent words stay 0).
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

# One row per document, one column per word; columns are sorted so they line
# up with the vectorizer's output shown at the end.
series_A = pd.Series(numOfWordsA)
series_B = pd.Series(numOfWordsB)
df = pd.concat([series_A, series_B], axis=1).T
df = df.reindex(sorted(df.columns), axis=1)
display(df)

# tf_df is used only for the document count and for which entries are
# non-zero; the final product below uses the raw counts in df.
tf_df = df.divide(df.sum(1), axis='index')

# Smoothed IDF: ln((1 + n) / (1 + df(t))) + 1.
n_d = 1 + tf_df.shape[0]
df_d_t = 1 + (tf_df.values > 0).sum(0)
idf = np.log(n_d / df_d_t) + 1

# Wrapped in display() so the table is shown even when this is not the last
# expression of a notebook cell (a bare expression prints nothing in a script).
display(pd.DataFrame(df.values * idf,
                     columns=df.columns))

tfidf = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w*\\b', norm=None)
tfidf_matrix = tfidf.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back only on releases predating it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
display(pd.DataFrame(tfidf_matrix.todense(),
                     columns=feature_names))
For more details on the implementation, refer to the documentation here.