Search code examples
python-3.x scikit-learn tfidfvectorizer

What is the math behind TfidfVectorizer?


I am trying to understand the math behind the TfidfVectorizer. I used this tutorial, but my code is a little bit changed:

The tutorial also says at the end that "the values differ slightly because sklearn uses a smoothed version idf and various other little optimizations."

I want to be able to use TfidfVectorizer, but also to calculate the same simple sample by hand.

Here is my whole code:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def main():
    """Print hand-computed TF, IDF and TF-IDF next to scikit-learn's output.

    A fixed two-document corpus is processed twice: once with the helper
    functions in this file and once with scikit-learn, and every table is
    printed so the numbers can be compared by eye.

    The hand-computed TF-IDF is ``(count / document length) * ln(N / df)``.
    With ``smooth_idf=False`` and ``norm=None`` scikit-learn instead computes
    ``count * (ln(N / df) + 1)`` (see the TfidfTransformer documentation), so
    the TF-IDF tables are expected to differ even though the TF tables agree.

    Requires scikit-learn >= 1.0 for ``get_feature_names_out()``; the older
    ``get_feature_names()`` was removed in scikit-learn 1.2.
    """
    documentA = 'the man went out for a walk'
    documentB = 'the children sat around the fire'
    corpus = [documentA, documentB]
    bagOfWordsA = documentA.split(' ')
    bagOfWordsB = documentB.split(' ')

    uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

    print('----------- compare word count -------------------')
    # Both count dicts share the full vocabulary so their tables line up.
    numOfWordsA = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsA:
        numOfWordsA[word] += 1
    numOfWordsB = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsB:
        numOfWordsB[word] += 1

    tfA = computeTF(numOfWordsA, bagOfWordsA)
    tfB = computeTF(numOfWordsB, bagOfWordsB)
    print(pd.DataFrame([tfA, tfB]))

    # The pattern accepts tokens of one or more word characters, so the
    # single-letter word 'a' is kept, as it is by split(' ') above.
    CV = CountVectorizer(stop_words=None, token_pattern=r'(?u)\b\w\w*\b')
    cv_ft = CV.fit_transform(corpus)

    # use_idf=False with an l1 norm turns raw counts into count / row total.
    tt = TfidfTransformer(use_idf=False, norm='l1')
    t = tt.fit_transform(cv_ft)
    print(pd.DataFrame(t.todense().tolist(), columns=CV.get_feature_names_out()))

    print('----------- compare idf -------------------')
    idfs = computeIDF([numOfWordsA, numOfWordsB])
    print(pd.DataFrame([idfs]))

    tfidfA = computeTFIDF(tfA, idfs)
    tfidfB = computeTFIDF(tfB, idfs)
    print(pd.DataFrame([tfidfA, tfidfB]))

    ttf = TfidfTransformer(use_idf=True, smooth_idf=False, norm=None)
    f = ttf.fit_transform(cv_ft)
    print(pd.DataFrame(f.todense().tolist(), columns=CV.get_feature_names_out()))

    print('----------- TfidfVectorizer -------------------')
    vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True, stop_words=None, token_pattern=r'(?u)\b\w\w*\b', norm=None)
    vectors = vectorizer.fit_transform([documentA, documentB])
    feature_names = vectorizer.get_feature_names_out()
    print(pd.DataFrame(vectors.todense().tolist(), columns=feature_names))


def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word of one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens; only its length is used, as the
            total that each count is divided by.

    Returns:
        A new dict mapping every key of ``wordDict`` to
        ``count / len(bagOfWords)``. When ``bagOfWords`` is empty every
        frequency is ``0.0``.
    """
    totalWords = len(bagOfWords)
    if totalWords == 0:
        # An empty document contains no terms; report 0.0 rather than
        # failing with ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {word: count / totalWords for word, count in wordDict.items()}


def computeIDF(documents):
    """Return the unsmoothed inverse document frequency of every word.

    ``idf(word) = ln(N / df(word))`` where ``N`` is the number of documents
    and ``df(word)`` is the number of documents whose count for that word is
    greater than zero.

    Args:
        documents: Sequence of dicts, one per document, each mapping
            word -> number of occurrences in that document.

    Returns:
        A dict mapping every word seen in any document to its idf, in
        first-seen order. A word that occurs in no document (all counts are
        zero) gets ``0.0``. An empty ``documents`` sequence yields ``{}``.
    """
    import math

    N = len(documents)

    docFreq = {}
    for document in documents:
        for word, count in document.items():
            # Register the word even when its count is zero, and even when
            # it is missing from the first document.
            docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] += 1

    # df == 0 would divide by zero; such a word has tf == 0 everywhere, so a
    # weight of 0.0 leaves every tf-idf product unchanged.
    return {
        word: math.log(N / df) if df else 0.0
        for word, df in docFreq.items()
    }


def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the idf of the same word.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency.

    Returns:
        A new dict with the keys of ``tfBagOfWords`` and values ``tf * idf``.

    Raises:
        KeyError: If a word of ``tfBagOfWords`` is missing from ``idfs``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}


# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

I can compare the calculation of term frequency, and both results look the same. But when I calculate the IDF and then the TF-IDF, there are differences between the code from the website and TfidfVectorizer (I also tried the combination of CountVectorizer and TfidfTransformer to be sure it returns the same results as TfidfVectorizer does).

Code Tf-Idf results:

enter image description here

TfidfVectorizer Tf-Idf results:

enter image description here

Can anybody help me with code that returns the same results as TfidfVectorizer, or with TfidfVectorizer settings that return the same results as the code above?


Solution

  • Here is my adaptation of your code, which reproduces the TfidfVectorizer output for your data.

    
    # Notebook cell: rebuilds the smoothed TF-IDF table by hand with
    # pandas/numpy. IPython's display() is used below to show the count table.
    import pandas as pd
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
    from IPython.display import display
    
    documentA = 'the man went out for a walk'
    documentB = 'the children sat around the fire'
    corpus = [documentA, documentB]
    bagOfWordsA = documentA.split(' ')
    bagOfWordsB = documentB.split(' ')
    
    uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
    
    print('----------- compare word count -------------------')
    # Count every vocabulary word per document; words absent from a document
    # keep the initial count of 0.
    numOfWordsA = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsA:
        numOfWordsA[word] += 1
    numOfWordsB = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsB:
        numOfWordsB[word] += 1
    
    # Raw count table: one row per document, one column per word, with the
    # columns sorted alphabetically (presumably to line up with the
    # vectorizer's column order -- confirm against its output).
    series_A = pd.Series(numOfWordsA)
    series_B = pd.Series(numOfWordsB)
    df = pd.concat([series_A, series_B], axis=1).T
    df = df.reindex(sorted(df.columns), axis=1)
    display(df)
    
    # Row-normalised counts. Below, only the number of rows and the
    # zero/non-zero pattern of this table are used, not the values themselves.
    tf_df = df.divide(df.sum(1),axis='index')
    
    # Smoothed idf: idf = ln((1 + n) / (1 + df(t))) + 1, where n is the number
    # of documents and df(t) the number of documents containing term t. This
    # is the formula the scikit-learn docs give for smooth_idf=True.
    n_d = 1+ tf_df.shape[0]
    df_d_t = 1 + (tf_df.values>0).sum(0)
    idf = np.log(n_d/df_d_t) + 1
    
    # The raw counts (df, not tf_df) are multiplied by idf. The result is a
    # bare expression, so it is only shown as the notebook cell's output.
    pd.DataFrame(df.values * idf,
                      columns=df.columns )
    

    enter image description here

    # Reference output from scikit-learn for the same corpus: default smoothed
    # idf, no row normalisation, and a token pattern that keeps one-letter words.
    # get_feature_names() was removed in scikit-learn 1.2; use
    # get_feature_names_out() (available since 1.0) instead.
    tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w\w*\b', norm=None)
    pd.DataFrame(tfidf.fit_transform(corpus).toarray(),
                 columns=tfidf.get_feature_names_out())
    

    enter image description here

    For more details on the implementation, refer to the documentation here.