Search code examples
python scikit-learn pipeline sentiment-analysis feature-extraction

How to add a feature using a pipeline and FeatureUnion


In the code below I use a Twitter dataset to perform sentiment analysis. I use a pipeline that performs the following steps:

1) performs some basic text preprocessing

2) vectorizes the tweet text

3) adds an extra feature (text length)

4) classification

I would like to add one more feature: the scaled number of followers. I wrote a function that takes the whole dataframe (df) as input and returns a new dataframe with the number of followers scaled. However, I am finding it challenging to add this step to the pipeline, i.e. to combine this feature with the other features using the sklearn pipeline.

Any help or advice on this problem would be much appreciated.

The question and the code below are inspired by Ryan's post: pipelines


import re

import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

def import_data(filename, sep, eng, header=None, skiprows=1):
    """Load a delimited text file and label its three columns.

    Parameters
    ----------
    filename : path of the file handed to ``pandas.read_csv``.
    sep : field delimiter.
    eng : parser engine name forwarded as ``engine``.
    header : forwarded to ``read_csv``; ``None`` means no header row is parsed.
    skiprows : forwarded to ``read_csv``; the default skips the first line.

    Returns
    -------
    A DataFrame whose columns are named 'text', 'followers' and 'sentiment'.
    """
    frame = pd.read_csv(
        filename,
        sep=sep,
        engine=eng,
        header=header,
        skiprows=skiprows,
    )
    # Assigning three names requires the file to contain exactly three
    # columns; pandas raises on a length mismatch.
    frame.columns = ['text', 'followers', 'sentiment']
    return frame

# Load the tab-separated tweet file with the pure-Python parser engine.
df = import_data('apple_v3.txt', sep='\t', eng='python')

# Only the raw tweet text is used as model input here; the label is the
# sentiment column.
X = df.text
y = df.sentiment

# No random_state is given, so the train/test split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Tweet-aware tokenizer: lower-cases tokens and shortens elongated character
# runs; its tokenize method replaces CountVectorizer's default tokenizer.
tokenizer = nltk.casual.TweetTokenizer(reduce_len=True, preserve_case=False)
count_vect = CountVectorizer(tokenizer=tokenizer.tokenize)
classifier = LogisticRegression()

def get_scalled_followers(df):
    """Return a new dataframe whose 'followers' column is min-max scaled.

    The input dataframe is left untouched: the scaling is applied to a copy,
    so the caller's raw follower counts are not overwritten as a side effect.

    Parameters
    ----------
    df : DataFrame containing a numeric 'followers' column.

    Returns
    -------
    A copy of ``df`` in which 'followers' has been cast to float and rescaled
    with ``MinMaxScaler``; all other columns are unchanged.

    Notes
    -----
    A fresh scaler is fitted on whatever frame is passed in, so calling this
    separately on training and test data scales each with its own min/max.
    Inside a pipeline, use ``MinMaxScaler`` as a step instead so the scaling
    learned on the training data is reused at prediction time.
    """
    scaled = df.copy()
    # Cast first so the column can hold the fractional values the scaler
    # produces.
    scaled[['followers']] = scaled[['followers']].astype(float)
    scaled[['followers']] = MinMaxScaler().fit_transform(scaled[['followers']])
    return scaled

def get_tweet_length(text):
    """Return the length of ``text`` as reported by ``len``."""
    n_chars = len(text)
    return n_chars
import numpy as np

def genericize_mentions(text):
    """Replace every @-mention in ``text`` with the token 'thisisanatmention'.

    A mention is an '@' followed by one or more word characters, underscores
    or hyphens. Text without mentions is returned unchanged.
    """
    mention = re.compile(r'@[\w_-]+')
    return mention.sub('thisisanatmention', text)

def reshape_a_feature_column(series):
    """Turn a flat sequence of values into an (n, 1) NumPy column array.

    ``n`` is ``len(series)``; the result has one row per input value so it can
    be stacked next to other feature matrices.
    """
    n_rows = len(series)
    column = np.asarray(series).reshape((n_rows, 1))
    return column

def pipelinize_feature(function, active=True):
    """Wrap a per-sample function as a transformer that emits one feature column.

    Parameters
    ----------
    function : callable applied to each element of the input (e.g. one tweet)
        and returning a single value.
    active : when true, the column holds ``function``'s outputs; when false,
        the column is all zeros, which switches the feature off without
        changing the shape of the combined feature matrix.

    Returns
    -------
    A ``FunctionTransformer`` whose ``transform`` maps a list or series of n
    items to an (n, 1) array, so it can be used as a ``Pipeline`` step or as an
    entry of a ``FeatureUnion``.
    """
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            processed = [function(i) for i in list_or_series]
            return reshape_a_feature_column(processed)
        # Inactive: keep the column in place but fill it with zeros.
        return reshape_a_feature_column(np.zeros(len(list_or_series)))

    # The wrapper must be returned; without this the caller receives None
    # instead of a transformer. validate=False lets raw strings through
    # untouched, and kw_args forwards the requested ``active`` flag to the
    # inner function.
    return FunctionTransformer(
        list_comprehend_a_function,
        validate=False,
        kw_args={'active': active},
    )

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn_helpers import pipelinize, genericize_mentions, train_test_and_evaluate


# Full model: normalise @-mentions, build the feature matrix by placing the
# bag-of-words counts next to the tweet-length column, then classify.
sentiment_pipeline = Pipeline(steps=[
    ('genericize_mentions', pipelinize(genericize_mentions, active=True)),
    ('features', FeatureUnion(transformer_list=[
        ('vectorizer', count_vect),
        ('post_length', pipelinize_feature(get_tweet_length, active=True)),
    ])),
    ('classifier', classifier),
])

# train_test_and_evaluate is imported from sklearn_helpers; its result is
# unpacked as the pipeline plus a confusion matrix. Presumably it fits on the
# training split and scores on the test split -- confirm against that helper.
sentiment_pipeline, confusion_matrix = train_test_and_evaluate(
    sentiment_pipeline, X_train, y_train, X_test, y_test
)


Solution

  • The best explanation I have found so far is at the following post: pipelines

    My data includes heterogeneous features, and the following step-by-step approach works well and is easy to understand:

    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import Pipeline, FeatureUnion
    
    #step1 - select data from dataframe and split the dataset in train and test sets
    
    target = 'sentiment'
    # Model inputs: every column except the label.
    features = [col for col in df.columns.values if col not in ('sentiment',)]
    # Numeric inputs: what remains once the raw text and the label are removed.
    numeric_features = [col for col in df.columns.values if col not in ('text', 'sentiment')]

    # Fixed random_state makes the 67/33 split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features],
        df[target],
        test_size=0.33,
        random_state=42,
    )
    
    #step2 - create a number selector class and a text selector class. These classes select specific columns from the dataframe
    
    class NumberSelector(BaseEstimator, TransformerMixin):
        """Pipeline step that selects a single column and keeps it two-dimensional.

        ``key`` is the column label to select. When the input is a pandas
        DataFrame the output is a one-column DataFrame, which is the shape
        numeric transformers such as scalers expect.
        """

        def __init__(self, key):
            self.key = key

        def fit(self, X, y=None):
            # Stateless: nothing is learned from the data.
            return self

        def transform(self, X):
            # Indexing with a list of labels (rather than a bare label) keeps
            # the selection as a table instead of a single series.
            wanted = [self.key]
            return X[wanted]
    
    class TextSelector(BaseEstimator, TransformerMixin):
        """Pipeline step that selects a single column as a one-dimensional object.

        ``key`` is the column label to select. When the input is a pandas
        DataFrame the output is a Series, i.e. the flat iterable of documents
        that text vectorizers expect.
        """

        def __init__(self, key):
            self.key = key

        def fit(self, X, y=None):
            # Stateless: nothing is learned from the data.
            return self

        def transform(self, X):
            # Bare-label indexing returns the column itself, not a table.
            column = X[self.key]
            return column
    
    #step 3 create one pipeline for the text data and one for the numerical data
    
    
    # Text branch: pick the tweet text column and turn it into TF-IDF features
    # with English stop words removed. The selector key must match the
    # dataframe's column name, which is 'text' (columns are text / followers /
    # sentiment); selecting a non-existent column would fail at fit time.
    text = Pipeline([
                    ('selector', TextSelector(key='text')),
                    ('tfidf', TfidfVectorizer(stop_words='english'))
                ])

    # Sanity check of this branch on its own.
    text.fit_transform(X_train)
    
    # Numeric branch: pick the followers column as a one-column table and
    # rescale it with MinMaxScaler (the step is labelled 'standard').
    followers = Pipeline(steps=[
        ('selector', NumberSelector(key='followers')),
        ('standard', MinMaxScaler()),
    ])

    # Sanity check of this branch on its own.
    followers.fit_transform(X_train)
    
    #step 4 - features union
    
    # Combine the text and follower branches into one feature matrix.
    # Note that the entry labelled 'length' carries the followers pipeline.
    feats = FeatureUnion(transformer_list=[
        ('text', text),
        ('length', followers),
    ])

    # Wrap the union in a pipeline and run it once to check the combined
    # feature matrix can be built from the training data.
    feature_processing = Pipeline(steps=[('feats', feats)])
    feature_processing.fit_transform(X_train)
    
    # step 5 - add the classifier and predict 
    
    # End-to-end model: combined features followed by a linear SVM.
    # probability=True is what makes predict_proba available later on.
    pipeline = Pipeline(steps=[
        ('features', feats),
        ('classifier', SVC(kernel='linear', probability=True, C=1, class_weight='balanced')),
    ])

    pipeline.fit(X_train, y_train)

    # Accuracy on the held-out split: fraction of predictions equal to the label.
    preds = pipeline.predict(X_test)
    np.mean(preds == y_test)
    
    # step 6 use the model to predict new data not included in the test set
    # in my example the pipeline expects a dataframe as an input which should have a column called 'text' and a column called 'followers'
    
    # One unseen sample, shaped like the training frame: a 'text' column and a
    # 'followers' column.
    array = [["@apple is amazing", 25000]]
    dfObj = pd.DataFrame(data=array, columns=['text', 'followers'])

    # Predicted class, e.g. positive or negative sentiment.
    print(pipeline.predict(dfObj))

    # Probability assigned to each class.
    print(pipeline.predict_proba(dfObj))