machine-learning · scikit-learn · text-classification · naivebayes · tfidfvectorizer

How to use Tf-idf features for training your model?


from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True,
                        min_df=5,
                        norm='l2',
                        ngram_range=(1, 2),
                        stop_words='english')

feature1 = tfidf.fit_transform(df.Rejoined_Stem)
array_of_feature = feature1.toarray()

I used the above code to get features for my text document.

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  # Multinomial Naive Bayes on lemmatized text

X_train, X_test, y_train, y_test = train_test_split(df['Rejoined_Lemmatize'], df['Product'], random_state=0)
X_train_counts = tfidf.fit_transform(X_train)
clf = MultinomialNB().fit(X_train_counts, y_train)
y_pred = clf.predict(tfidf.transform(X_test))

Then I used this code to train my model. Can someone explain how exactly the above features are being used while training the model, since that feature1 variable is not used anywhere during training?


Solution

  • No, you did not use feature1: you performed another transformation, X_train_counts, and the model was trained on that instead.

    Let’s go through your code in a logical flow, using only the variables that were actually used in feature extraction and model training.

    # imports used
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    
    # split the data; random_state=0, and test_size defaults to 0.25 since you did not set it
    
    X_train, X_test, y_train, y_test = train_test_split(df['Rejoined_Lemmatize'], df['Product'], random_state=0)
    
    # you initialized your transformer: `fit_transform` on X_train, `transform` on X_test
    
    tfidf = TfidfVectorizer(sublinear_tf=True,
                            min_df=5,
                            norm='l2',
                            ngram_range=(1, 2),
                            stop_words='english')
    
    
    X_train_counts = tfidf.fit_transform(X_train)
    X_test_counts = tfidf.transform(X_test)
    
    # you initialized your model and fit it on X_train_counts and y_train
    clf = MultinomialNB()
    clf.fit(X_train_counts, y_train)
    
    # you predicted from your transformed features
    y_pred = clf.predict(X_test_counts)
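
    As a quick sanity check: `fit_transform` learns the vocabulary from X_train, while `transform` only reuses it, so both matrices share the same columns. A minimal sketch, assuming the variables above are in scope:

    # both matrices have the same number of columns (features),
    # because X_test was vectorized with the vocabulary learned on X_train
    print(X_train_counts.shape)    # (n_train_docs, n_features)
    print(X_test_counts.shape)     # (n_test_docs, n_features) -- same n_features
    
    # the learned vocabulary lives on the fitted vectorizer
    print(len(tfidf.vocabulary_))  # equals n_features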
    

    There is a better way to use the scikit-learn API that eliminates this confusion and keeps you from getting mixed up: Pipelines.

    # imports used: see Pipeline
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    
    # split the data; random_state=0, and test_size defaults to 0.25 since you did not set it
    
    X_train, X_test, y_train, y_test = train_test_split(df['Rejoined_Lemmatize'], df['Product'], random_state=0)
    
    # get the params
    tfidf_params = dict(sublinear_tf=True,
                        min_df=5,
                        norm='l2',
                        ngram_range=(1, 2),
                        stop_words='english')
    
    # create a Pipeline that will do the feature transformation and then pass the result to the model
    
    clf = Pipeline(steps=[
        ('features', TfidfVectorizer(**tfidf_params)),
        ('model', MultinomialNB())
    ])
    
    # use clf as a model: fit it on X_train and y_train
    clf.fit(X_train, y_train)
    
    # predict straight from the raw text; the pipeline handles the transformation
    y_pred = clf.predict(X_test)
    

    In .fit, the pipeline runs fit_transform on the data and then passes the result to the model. In .predict, it runs transform before passing the data to the model.
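
    Conceptually, here is a rough sketch of what the pipeline does under the hood (illustrative only, not the actual scikit-learn internals):

    # clf.fit(X_train, y_train) roughly does:
    features = TfidfVectorizer(**tfidf_params)
    model = MultinomialNB()
    Xt = features.fit_transform(X_train)
    model.fit(Xt, y_train)
    
    # clf.predict(X_test) roughly does:
    y_pred = model.predict(features.transform(X_test))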

    The best thing about this approach is that you can easily switch models or transformers. Here is an example of a baseline comparison of several models:

    # collection to store results 
    from collections import defaultdict
    import pandas as pd
    
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    # models to test
    from sklearn.linear_model import PassiveAggressiveClassifier 
    from sklearn.linear_model import RidgeClassifierCV
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegressionCV     
    
    
    # initialize our storage
    bench_mark = defaultdict(list)
    
    # split the data; random_state=0, and test_size defaults to 0.25 since you did not set it
    
    X_train, X_test, y_train, y_test = train_test_split(df['Rejoined_Lemmatize'], df['Product'], random_state=0)
    
    # get the transformer params
    tfidf_params = dict(sublinear_tf=True,
                        min_df=5,
                        norm='l2',
                        ngram_range=(1, 2),
                        stop_words='english')
    
    # list of models we would like to compare
    models = [
        PassiveAggressiveClassifier(C=1e-1, max_iter=1000, tol=1e-3),
        RidgeClassifierCV(scoring='roc_auc', cv=10),  # note: roc_auc scoring only works for binary targets
        LogisticRegressionCV(cv=5, solver='saga', scoring='accuracy', random_state=1, n_jobs=-1),
        SGDClassifier(loss='log', random_state=1, max_iter=101),  # use loss='log_loss' on scikit-learn >= 1.3
    ]
    
    # train, test and store each model
    for model in models:
    
        # our pipeline is changed to accept any model
        clf = Pipeline(steps=[
            ('features', TfidfVectorizer(**tfidf_params)),
            ('model', model)  # just model, not model(), since we instantiated it in the models list
        ])
    
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
    
        model_name = clf.named_steps['model'].__class__.__name__  # hack to get the class name
    
        model_params = clf.named_steps['model'].get_params()
    
    
        print(f'{model_name} Scored: {score:.3f}\n')
    
        bench_mark['model_name'].append(model_name)
        bench_mark['score'].append(score)
        bench_mark['model'].append(clf)
        bench_mark['used_params'].append(model_params)
    
    # finally, collect the bench_mark into a DataFrame
    models_df = pd.DataFrame(bench_mark)
    
    # now you have the trained models in a DataFrame, with their scores and parameters.
    # You can access and use any of them.
    
    logistic_reg = models_df[models_df['model_name']=='LogisticRegressionCV']['model'].iloc[0]
    
    y_preds = logistic_reg.predict(X_test)
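
    From here the retrieved model behaves like any other fitted estimator. For example, you could print a per-class breakdown with scikit-learn's classification_report (a usage sketch, assuming y_test is still in scope):

    from sklearn.metrics import classification_report
    
    # precision, recall and F1 per Product class for the retrieved pipeline
    print(classification_report(y_test, y_preds))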
    

    Hope this helps!